Remove dead code

This commit is contained in:
Alex Pyrgiotis 2024-03-14 10:46:10 +02:00
parent f42bb23229
commit 7ea7c8a0cc
No known key found for this signature in database
GPG key ID: B6C15EBA0357C9AA
8 changed files with 16 additions and 448 deletions

View file

@ -21,30 +21,6 @@ RUN case "$ARCH" in \
RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt RUN pip install -vv --break-system-packages --require-hashes -r /tmp/requirements.txt
###########################################
# Download Tesseract data
FROM alpine:latest as tessdata-dl
ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
# Download the trained models from the latest GitHub release of Tesseract, and
# store them under /usr/share/tessdata. This is basically what distro packages
# do under the hood.
#
# Because the GitHub release contains more files than just the trained models,
# we use `find` to fetch only the '*.traineddata' files in the top directory.
#
# Before we untar the models, we also check if the checksum is the expected one.
#
# NOTE: The release tag is resolved at build time (".../releases/latest"), while
# TESSDATA_CHECKSUM is pinned. Once a new tessdata_fast release is published, the
# `sha256sum -c` step fails and stops the `&&` chain until the ARG is updated.
RUN mkdir /usr/share/tessdata/ && mkdir tessdata && cd tessdata \
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
| sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
&& wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
&& echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
&& tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
&& find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata/ \; \
&& cd .. && rm -r tessdata
########################################### ###########################################
# Download H2ORestart # Download H2ORestart
FROM alpine:latest as h2orestart-dl FROM alpine:latest as h2orestart-dl
@ -74,7 +50,6 @@ RUN apk --no-cache -U upgrade && \
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/fitz/ /usr/lib/python3.12/site-packages/fitz
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/pymupdf/ /usr/lib/python3.12/site-packages/pymupdf
COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs COPY --from=pymupdf-build /usr/lib/python3.12/site-packages/PyMuPDF.libs/ /usr/lib/python3.12/site-packages/PyMuPDF.libs
COPY --from=tessdata-dl /usr/share/tessdata/ /usr/share/tessdata
COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext COPY --from=h2orestart-dl /libreoffice_ext/ /libreoffice_ext
RUN install -dm777 "/usr/lib/libreoffice/share/extensions/" RUN install -dm777 "/usr/lib/libreoffice/share/extensions/"

View file

@ -13,15 +13,6 @@ def running_on_qubes() -> bool:
return os.path.exists("/usr/share/qubes/marker-vm") return os.path.exists("/usr/share/qubes/marker-vm")
def get_tessdata_dir() -> str:
    """Return the directory that holds the Tesseract trained-data files.

    A non-empty ``TESSDATA_PREFIX`` environment variable takes precedence.
    Otherwise, the location depends on whether we are running on Qubes.
    """
    prefix_from_env = os.environ.get("TESSDATA_PREFIX")
    if prefix_from_env:
        return prefix_from_env
    if running_on_qubes():
        return "/usr/share/tesseract/tessdata/"
    return "/usr/share/tessdata/"
class DangerzoneConverter: class DangerzoneConverter:
def __init__(self, progress_callback: Optional[Callable] = None) -> None: def __init__(self, progress_callback: Optional[Callable] = None) -> None:
self.percentage: float = 0.0 self.percentage: float = 0.0

View file

@ -1,161 +0,0 @@
"""
Here are the steps, with progress bar percentages:
- 50%-95%: Convert each page of pixels into a PDF (each page takes 45/n%, where n is the number of pages)
- 95%-100%: Compress the final PDF
"""
import asyncio
import contextlib
import glob
import io
import json
import os
import sys
from typing import Optional
from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes
# XXX: PyMUPDF logs to stdout by default [1]. The PyMuPDF devs provide a way [2] to log to
# stderr, but it's based on environment variables. These envvars are consulted at import
# time [3], so we have to set them here, before we import `fitz`.
#
# [1] https://github.com/freedomofpress/dangerzone/issues/877
# [2] https://github.com/pymupdf/PyMuPDF/issues/3135#issuecomment-1992625724
# [3] https://github.com/pymupdf/PyMuPDF/blob/9717935eeb2d50d15440d62575878214226795f9/src/__init__.py#L62-L63
os.environ["PYMUPDF_MESSAGE"] = "fd:2"
os.environ["PYMUPDF_LOG"] = "fd:2"
class PixelsToPDF(DangerzoneConverter):
    """Turn per-page RGB pixel dumps into a single, optionally OCRed, PDF.

    The pages are read from ``<tempdir>/pixels/page-<N>.{rgb,width,height}`` and
    the result is saved as ``safe-output-compressed.pdf``. The progress percentage
    starts at 50%, advances by ``45 / <number of pages>`` per page, and is set to
    100% right before the final document is saved.
    """

    async def convert(
        self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None
    ) -> None:
        """Convert every page found under ``tempdir`` and save the resulting PDF.

        Args:
            ocr_lang: Language passed to the OCR engine. When it is ``None`` (or
                empty), the pages are embedded as images without a text layer.
            tempdir: Directory that contains the ``pixels/`` subdirectory.
                Defaults to ``/safezone``.

        Raises:
            ValueError: If no ``page-*.rgb`` file exists under
                ``<tempdir>/pixels``, or if a ``.width`` / ``.height`` file does
                not contain an integer.
        """
        self.percentage = 50.0
        if tempdir is None:
            tempdir = "/safezone"

        # XXX lazy loading of fitz module to avoid import issues on non-Qubes systems
        import fitz

        num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb"))
        if num_pages == 0:
            # Fail with an error type that the callers of this converter handle,
            # instead of a ZeroDivisionError in the percentage computation below.
            raise ValueError(f"No pages to convert were found in {tempdir}/pixels")

        safe_doc = fitz.Document()

        # Convert RGB files to PDF files
        percentage_per_page = 45.0 / num_pages
        for page_num in range(1, num_pages + 1):
            filename_base = f"{tempdir}/pixels/page-{page_num}"
            rgb_filename = f"{filename_base}.rgb"
            width_filename = f"{filename_base}.width"
            height_filename = f"{filename_base}.height"

            with open(width_filename) as f:
                width = int(f.read().strip())
            with open(height_filename) as f:
                height = int(f.read().strip())
            with open(rgb_filename, "rb") as rgb_f:
                untrusted_rgb_data = rgb_f.read()

            pixmap = fitz.Pixmap(
                fitz.Colorspace(fitz.CS_RGB),
                width,
                height,
                untrusted_rgb_data,
                False,
            )
            pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)

            if ocr_lang:  # OCR the document
                self.update_progress(
                    f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
                )
                if int(fitz.version[2]) >= 20230621000001:
                    page_pdf_bytes = pixmap.pdfocr_tobytes(
                        compress=True,
                        language=ocr_lang,
                        tessdata=get_tessdata_dir(),
                    )
                else:
                    # XXX: In PyMuPDF v1.22.5, the function signature of
                    # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument
                    # to explicitly set the Tesseract data dir [1].
                    #
                    # In earlier versions, the PyMuPDF developers recommend setting this
                    # path via the TESSDATA_PREFIX environment variable. In practice,
                    # this environment variable is read at import time, so subsequent
                    # changes to the environment variable are not tracked [2].
                    #
                    # To make things worse, any attempt to alter the internal attribute
                    # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using
                    # the OCR functions. That's due to the way imports work in `fitz`,
                    # where somehow the internal `fitz.fitz` module is shadowed.
                    #
                    # A hacky solution is to grab the `fitz.fitz` module from
                    # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can
                    # get away with this hack because we have a proper solution for
                    # subsequent PyMuPDF versions, and we know that nothing will change
                    # in older versions.
                    #
                    # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5
                    #
                    # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save
                    # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308
                    sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir()  # type: ignore [attr-defined]

                    page_pdf_bytes = pixmap.pdfocr_tobytes(
                        compress=True,
                        language=ocr_lang,
                    )
            else:  # Don't OCR
                self.update_progress(
                    f"Converting page {page_num}/{num_pages} from pixels to PDF"
                )
                page_doc = fitz.Document()
                page_doc.insert_file(pixmap)
                page_pdf_bytes = page_doc.tobytes(deflate_images=True)

            # Both branches produce a single-page PDF; append it to the final
            # document.
            safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes))
            self.percentage += percentage_per_page

        self.percentage = 100.0
        self.update_progress("Safe PDF created")

        # Move converted files into /safezone
        if running_on_qubes():
            safe_pdf_path = f"{tempdir}/safe-output-compressed.pdf"
        else:
            safe_pdf_path = "/safezone/safe-output-compressed.pdf"

        safe_doc.save(safe_pdf_path, deflate_images=True)

    def update_progress(self, text: str, *, error: bool = False) -> None:
        """Report the current progress.

        On Qubes, the progress callback (if one is set) is invoked with the error
        flag, the text and the current percentage. Otherwise, the same fields are
        printed to stdout as a single JSON object and stdout is flushed.
        """
        if running_on_qubes():
            if self.progress_callback:
                self.progress_callback(error, text, self.percentage)
        else:
            print(
                json.dumps(
                    {"error": error, "text": text, "percentage": self.percentage}
                )
            )
            sys.stdout.flush()
async def main() -> int:
    """Run the pixels-to-PDF conversion, configured through the environment.

    OCR is requested only when the ``OCR`` environment variable is ``"1"``; the
    language is then read from ``OCR_LANGUAGE``.

    Returns:
        0 on success, or 1 after a ``RuntimeError`` / ``ValueError`` has been
        reported through the converter's ``update_progress()``.
    """
    ocr_lang = None
    if os.environ.get("OCR") == "1":
        ocr_lang = os.environ.get("OCR_LANGUAGE")

    converter = PixelsToPDF()
    try:
        await converter.convert(ocr_lang)
    except (RuntimeError, ValueError) as exc:
        converter.update_progress(str(exc), error=True)
        return 1
    return 0
if __name__ == "__main__":
sys.exit(asyncio.run(main()))

View file

@ -220,12 +220,6 @@ class IsolationProvider(ABC):
text = "Successfully converted document" text = "Successfully converted document"
self.print_progress(document, False, text, 100) self.print_progress(document, False, text, 100)
@abstractmethod
def pixels_to_pdf(
    self, document: Document, tempdir: str, ocr_lang: Optional[str]
) -> None:
    """Run the pixels-to-PDF phase for ``document``.

    This is an abstract hook with no default behavior; concrete isolation
    providers must override it.

    NOTE(review): presumably ``tempdir`` holds the page pixel data and
    ``ocr_lang=None`` disables OCR -- confirm against the implementations.
    """
    pass
def print_progress( def print_progress(
self, document: Document, error: bool, text: str, percentage: float self, document: Document, error: bool, text: str, percentage: float
) -> None: ) -> None:
@ -352,74 +346,3 @@ class IsolationProvider(ABC):
f"{debug_log}" # no need for an extra newline here f"{debug_log}" # no need for an extra newline here
f"{DOC_TO_PIXELS_LOG_END}" f"{DOC_TO_PIXELS_LOG_END}"
) )
# From global_common:
# def validate_convert_to_pixel_output(self, common, output):
# """
# Take the output from the convert to pixels tasks and validate it. Returns
# a tuple like: (success (boolean), error_message (str))
# """
# max_image_width = 10000
# max_image_height = 10000
# # Did we hit an error?
# for line in output.split("\n"):
# if (
# "failed:" in line
# or "The document format is not supported" in line
# or "Error" in line
# ):
# return False, output
# # How many pages was that?
# num_pages = None
# for line in output.split("\n"):
# if line.startswith("Document has "):
# num_pages = line.split(" ")[2]
# break
# if not num_pages or not num_pages.isdigit() or int(num_pages) <= 0:
# return False, "Invalid number of pages returned"
# num_pages = int(num_pages)
# # Make sure we have the files we expect
# expected_filenames = []
# for i in range(1, num_pages + 1):
# expected_filenames += [
# f"page-{i}.rgb",
# f"page-{i}.width",
# f"page-{i}.height",
# ]
# expected_filenames.sort()
# actual_filenames = os.listdir(common.pixel_dir.name)
# actual_filenames.sort()
# if expected_filenames != actual_filenames:
# return (
# False,
# f"We expected these files:\n{expected_filenames}\n\nBut we got these files:\n{actual_filenames}",
# )
# # Make sure the files are the correct sizes
# for i in range(1, num_pages + 1):
# with open(f"{common.pixel_dir.name}/page-{i}.width") as f:
# w_str = f.read().strip()
# with open(f"{common.pixel_dir.name}/page-{i}.height") as f:
# h_str = f.read().strip()
# w = int(w_str)
# h = int(h_str)
# if (
# not w_str.isdigit()
# or not h_str.isdigit()
# or w <= 0
# or w > max_image_width
# or h <= 0
# or h > max_image_height
# ):
# return False, f"Page {i} has invalid geometry"
# # Make sure the RGB file is the correct size
# if os.path.getsize(f"{common.pixel_dir.name}/page-{i}.rgb") != w * h * 3:
# return False, f"Page {i} has an invalid RGB file size"
# return True, True

View file

@ -1,24 +1,15 @@
import gzip import gzip
import json
import logging import logging
import os import os
import platform import platform
import shlex import shlex
import shutil import shutil
import subprocess import subprocess
import sys from typing import List, Tuple
from typing import Any, List, Optional, Tuple
from ..conversion import errors
from ..document import Document from ..document import Document
from ..util import get_tmp_dir # NOQA : required for mocking in our tests.
from ..util import get_resource_path, get_subprocess_startupinfo from ..util import get_resource_path, get_subprocess_startupinfo
from .base import ( from .base import IsolationProvider, terminate_process_group
PIXELS_TO_PDF_LOG_END,
PIXELS_TO_PDF_LOG_START,
IsolationProvider,
terminate_process_group,
)
TIMEOUT_KILL = 5 # Timeout in seconds until the kill command returns. TIMEOUT_KILL = 5 # Timeout in seconds until the kill command returns.
@ -234,31 +225,6 @@ class Container(IsolationProvider):
"""Unique container name for the pixels-to-pdf phase.""" """Unique container name for the pixels-to-pdf phase."""
return f"dangerzone-pixels-to-pdf-{document.id}" return f"dangerzone-pixels-to-pdf-{document.id}"
def assert_field_type(self, val: Any, _type: object) -> None:
    """Raise ``ValueError`` unless ``val`` is exactly of type ``_type``.

    The exact type is compared, instead of using ``isinstance()``, because
    ``bool`` is a subclass of ``int`` and would otherwise be accepted as one.

    See https://stackoverflow.com/a/37888668
    """
    if type(val) is _type:
        return
    raise ValueError("Status field has incorrect type")
def parse_progress_trusted(self, document: Document, line: str) -> None:
    """Parse a JSON status line returned by the container and report it.

    The line must decode to an object with an ``error`` (bool), a ``text`` (str)
    and a ``percentage`` (float) field, which are forwarded to
    ``print_progress()``. If anything goes wrong along the way, an error that
    quotes the offending line is reported instead, with a percentage of -1.
    """
    # Listed in the order that `print_progress()` expects them, after `document`.
    expected_fields = (("error", bool), ("text", str), ("percentage", float))
    try:
        status = json.loads(line)
        values = []
        for key, field_type in expected_fields:
            value = status[key]
            self.assert_field_type(value, field_type)
            values.append(value)
        self.print_progress(document, *values)
    except Exception:
        error_message = f"Invalid JSON returned from container:\n\n\t {line}"
        self.print_progress(document, True, error_message, -1)
def exec( def exec(
self, self,
args: List[str], args: List[str],
@ -337,84 +303,6 @@ class Container(IsolationProvider):
f"Unexpected error occurred while killing container '{name}': {str(e)}" f"Unexpected error occurred while killing container '{name}': {str(e)}"
) )
def pixels_to_pdf(
    self, document: Document, tempdir: str, ocr_lang: Optional[str]
) -> None:
    """Run the pixels-to-PDF phase in a container and collect the safe PDF.

    ``tempdir`` is mounted at ``/safezone`` in the container, and every line that
    the container writes to its stdout is passed to ``parse_progress_trusted()``.
    If the container exits with a non-zero code, the exception returned by
    ``errors.exception_from_error_code()`` is raised. Otherwise,
    ``safe-output-compressed.pdf`` is moved from ``tempdir`` to
    ``document.output_filename``, replacing any existing file.
    """
    # Convert pixels to safe PDF
    module_command = [
        "/usr/bin/python3",
        "-m",
        "dangerzone.conversion.pixels_to_pdf",
    ]

    # XXX: Until #748 gets merged, we have to run our pixels to PDF phase in a
    # container, which involves mounting two temp dirs. This does not bode well with
    # gVisor for two reasons:
    #
    # 1. Our gVisor integration chroot()s into /home/dangerzone/dangerzone-image/rootfs,
    #    meaning that the location of the temp dirs must be relevant to that path.
    # 2. Reading and writing to these temp dirs requires permissions which are not
    #    available to the user within gVisor's user namespace.
    #
    # For these reasons, and because the pixels to PDF phase is more trusted (and
    # will soon stop being containerized), we circumvent gVisor support by doing the
    # following:
    #
    # 1. Override our entrypoint script with a no-op command (/usr/bin/env).
    # 2. Set the PYTHONPATH so that we can import the Python code within
    #    /home/dangerzone/dangerzone-image/rootfs
    # 3. Run the container as the root user, so that it can always write to the
    #    mounted directories. This container is trusted, so running as root has no
    #    impact to the security of Dangerzone.
    img_root = "/home/dangerzone/dangerzone-image/rootfs"
    python_path = (
        f"{img_root}/opt/dangerzone:{img_root}/usr/lib/python3.12/site-packages"
    )
    ocr_flag = 0 if ocr_lang is None else 1
    container_args = [
        "-v",
        f"{tempdir}:/safezone:Z",
        "-e",
        f"OCR={ocr_flag}",
        "-e",
        f"OCR_LANGUAGE={ocr_lang}",
        "--entrypoint",
        "/usr/bin/env",
        "-e",
        f"PYTHONPATH={python_path}",
        "-e",
        f"TESSDATA_PREFIX={img_root}/usr/share/tessdata",
        "-u",
        "root",
    ]

    container_name = self.pixels_to_pdf_container_name(document)
    proc = self.exec_container(module_command, container_name, container_args)
    for raw_line in proc.stdout or ():
        self.parse_progress_trusted(document, raw_line.decode())
    exit_code = proc.wait()

    # In case of a dev run, log everything from the second container.
    if getattr(sys, "dangerzone_dev", False):
        assert proc.stderr
        stderr_output = proc.stderr.read().decode()
        log.info(
            f"Conversion output: (pixels to PDF)\n"
            f"{PIXELS_TO_PDF_LOG_START}\n{stderr_output}\n{PIXELS_TO_PDF_LOG_END}"
        )

    if exit_code != 0:
        log.error("pixels-to-pdf failed")
        raise errors.exception_from_error_code(exit_code)

    # Move the final file to the right place
    if os.path.exists(document.output_filename):
        os.remove(document.output_filename)
    produced_pdf = os.path.join(tempdir, "safe-output-compressed.pdf")
    shutil.move(produced_pdf, document.output_filename)
def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen: def start_doc_to_pixels_proc(self, document: Document) -> subprocess.Popen:
# Convert document to pixels # Convert document to pixels
command = [ command = [

View file

@ -1,20 +1,16 @@
import asyncio
import io import io
import logging import logging
import os import os
import shutil
import subprocess import subprocess
import sys import sys
import zipfile import zipfile
from pathlib import Path from pathlib import Path
from typing import IO, Optional from typing import IO
from ..conversion import errors
from ..conversion.common import running_on_qubes from ..conversion.common import running_on_qubes
from ..conversion.pixels_to_pdf import PixelsToPDF
from ..document import Document from ..document import Document
from ..util import get_resource_path from ..util import get_resource_path
from .base import PIXELS_TO_PDF_LOG_END, PIXELS_TO_PDF_LOG_START, IsolationProvider from .base import IsolationProvider
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -25,28 +21,6 @@ class Qubes(IsolationProvider):
def install(self) -> bool: def install(self) -> bool:
return True return True
def pixels_to_pdf(
    self, document: Document, tempdir: str, ocr_lang: Optional[str]
) -> None:
    """Convert the pixels under ``tempdir`` to a PDF within this process.

    Progress reported by the converter is forwarded to ``print_progress()`` for
    ``document``. A ``RuntimeError`` or ``ValueError`` raised by the conversion
    is re-raised as ``errors.UnexpectedConversionError``. On success,
    ``safe-output-compressed.pdf`` is moved from ``tempdir`` to
    ``document.output_filename``.
    """
    converter = PixelsToPDF(
        progress_callback=lambda error, text, percentage: self.print_progress(
            document, error, text, percentage
        )
    )
    try:
        asyncio.run(converter.convert(ocr_lang, tempdir))
    except (RuntimeError, ValueError) as exc:
        raise errors.UnexpectedConversionError(str(exc))
    finally:
        # In a dev run, log the captured output whether the conversion succeeded
        # or not.
        if getattr(sys, "dangerzone_dev", False):
            captured = converter.captured_output.decode()
            log.info(
                f"Conversion output: (pixels to PDF)\n"
                f"{PIXELS_TO_PDF_LOG_START}\n{captured}{PIXELS_TO_PDF_LOG_END}"
            )

    produced_pdf = f"{tempdir}/safe-output-compressed.pdf"
    shutil.move(produced_pdf, document.output_filename)
def get_max_parallel_conversions(self) -> int: def get_max_parallel_conversions(self) -> int:
return 1 return 1

View file

@ -3,7 +3,6 @@ import platform
import subprocess import subprocess
import sys import sys
import unicodedata import unicodedata
from typing import Optional
import appdirs import appdirs
@ -12,17 +11,6 @@ def get_config_dir() -> str:
return appdirs.user_config_dir("dangerzone") return appdirs.user_config_dir("dangerzone")
def get_tmp_dir() -> Optional[str]:
    """Return the parent directory for Dangerzone's temporary directories.

    The returned value is always ``None``, which leaves the choice of location
    to Python (e.g., ``/tmp`` on Linux). The function exists nonetheless, so that
    tests can set this directory by mocking it.
    """
    # No explicit parent directory; let Python pick the default location.
    parent_dir: Optional[str] = None
    return parent_dir
def get_resource_path(filename: str) -> str: def get_resource_path(filename: str) -> str:
if getattr(sys, "dangerzone_dev", False): if getattr(sys, "dangerzone_dev", False):
# Look for resources directory relative to python file # Look for resources directory relative to python file

View file

@ -134,8 +134,6 @@ class TestCli:
if os.environ.get("DUMMY_CONVERSION", False): if os.environ.get("DUMMY_CONVERSION", False):
args = ("--unsafe-dummy-conversion", *args) args = ("--unsafe-dummy-conversion", *args)
with tempfile.TemporaryDirectory() as t:
tmp_dir = Path(t)
# TODO: Replace this with `contextlib.chdir()` [1], which was added in # TODO: Replace this with `contextlib.chdir()` [1], which was added in
# Python 3.11. # Python 3.11.
# #
@ -145,19 +143,11 @@ class TestCli:
cwd = os.getcwd() cwd = os.getcwd()
os.chdir(tmp_path) os.chdir(tmp_path)
with mock.patch(
"dangerzone.isolation_provider.container.get_tmp_dir",
return_value=t,
):
result = CliRunner().invoke(cli_main, args) result = CliRunner().invoke(cli_main, args)
finally: finally:
if tmp_path is not None: if tmp_path is not None:
os.chdir(cwd) os.chdir(cwd)
if tmp_dir.exists():
stale_files = list(tmp_dir.iterdir())
assert not stale_files
# XXX Print stdout so that junitXML exports with output capturing # XXX Print stdout so that junitXML exports with output capturing
# actually include the stdout + stderr (they are combined into stdout) # actually include the stdout + stderr (they are combined into stdout)
print(result.stdout) print(result.stdout)