Remove timeouts

Remove timeouts due to several reasons:

1. Lost purpose: after implementing the containers page streaming the
   only subprocess we have left is LibreOffice. So don't have such a
   big risk of commands hanging (the original reason for timeouts).

2. Little benefit: predicting execution time is generically unsolvable
   computer science problem. Ultimately we were guessing an arbitrary
   time based on the number of pages and the document size. As a guess
   we made it pretty lax (30s per page or MB). A document hanging for
   this long will probably lead to user frustration in any case and the
   user may be compelled to abort the conversion.

3. Technical Challenges with non-blocking timeout: there have been
several technical challenges in keeping timeouts that we've made effort
to accommodate. A significant one was having to do non-blocking read to
ensure we could timeout when reading conversion stream (and then used
here)

Fixes #687
This commit is contained in:
deeplow 2024-01-25 11:34:11 +00:00
parent 4d3f2b32c7
commit 69c2a02d81
No known key found for this signature in database
GPG key ID: 577982871529A52A
10 changed files with 32 additions and 115 deletions

View file

@ -21,6 +21,10 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or
- Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET))
- Replace Dangerzone document rendering engine from pdftoppm PyMuPDF, essentially replacing a variety of tools (gm / tesseract / pdfunite / ps2pdf) ([issue #658](https://github.com/freedomofpress/dangerzone/issues/658))
### Removed
- Removed timeouts ([issue #687](https://github.com/freedomofpress/dangerzone/issues/687))
## Dangerzone 0.5.1
### Fixed

View file

@ -37,12 +37,6 @@ def print_header(s: str) -> None:
@click.option(
"--unsafe-dummy-conversion", "dummy_conversion", flag_value=True, hidden=True
)
@click.option(
"--enable-timeouts / --disable-timeouts",
default=True,
show_default=True,
help="Enable/Disable timeouts during document conversion",
)
@click.argument(
"filenames",
required=True,
@ -55,7 +49,6 @@ def print_header(s: str) -> None:
def cli_main(
output_filename: Optional[str],
ocr_lang: Optional[str],
enable_timeouts: bool,
filenames: List[str],
archive: bool,
dummy_conversion: bool,
@ -67,7 +60,7 @@ def cli_main(
elif is_qubes_native_conversion():
dangerzone = DangerzoneCore(Qubes())
else:
dangerzone = DangerzoneCore(Container(enable_timeouts=enable_timeouts))
dangerzone = DangerzoneCore(Container())
display_banner()
if len(filenames) == 1 and output_filename:

View file

@ -12,10 +12,8 @@ import time
from abc import abstractmethod
from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union
TIMEOUT_PER_PAGE: float = 30 # (seconds)
TIMEOUT_PER_MB: float = 30 # (seconds)
TIMEOUT_MIN: float = 60 # (seconds)
DEFAULT_DPI = 150 # Pixels per inch
INT_BYTES = 2
def running_on_qubes() -> bool:
@ -23,28 +21,6 @@ def running_on_qubes() -> bool:
return os.path.exists("/usr/share/qubes/marker-vm")
def calculate_timeout(size: float, pages: Optional[float] = None) -> float:
"""Calculate the timeout for a command.
The timeout calculation takes two factors in mind:
1. The size (in MiBs) of the dataset (document, multiple pages).
2. The number of pages in the dataset.
It then calculates proportional timeout values based on the above, and keeps the
large one. This way, we can handle several corner cases:
* Documents with lots of pages, but small file size.
* Single images with large file size.
"""
# Do not have timeouts lower than 10 seconds, if the file size is small, since
# we need to take into account the program's startup time as well.
timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN)
if pages:
timeout = max(timeout, TIMEOUT_PER_PAGE * pages)
return timeout
def get_tessdata_dir() -> str:
if running_on_qubes():
return "/usr/share/tesseract/tessdata/"
@ -76,7 +52,7 @@ class DangerzoneConverter:
@classmethod
def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
cls._write_bytes(num.to_bytes(2, "big", signed=False), file=file)
cls._write_bytes(num.to_bytes(INT_BYTES, "big", signed=False), file=file)
# ==== ASYNC METHODS ====
# We run sync methods in async wrappers, because pure async methods are more difficult:
@ -127,8 +103,6 @@ class DangerzoneConverter:
args: List[str],
*,
error_message: str,
timeout_message: Optional[str] = None,
timeout: Optional[float] = None,
stdout_callback: Optional[Callable] = None,
stderr_callback: Optional[Callable] = None,
) -> Tuple[bytes, bytes]:
@ -138,7 +112,6 @@ class DangerzoneConverter:
output in bytes.
:raises RuntimeError: if the process returns a non-zero exit status
:raises TimeoutError: if the process times out
"""
# Start the provided command, and return a handle. The command will run in the
# background.
@ -164,12 +137,9 @@ class DangerzoneConverter:
self.read_stream(proc.stderr, stderr_callback)
)
# Wait until the command has finished, for a specific timeout. Then, verify that the
# command has completed successfully. In any other case, raise an exception.
try:
ret = await asyncio.wait_for(proc.wait(), timeout=timeout)
except asyncio.exceptions.TimeoutError:
raise TimeoutError(timeout_message)
# Wait until the command has finished. Then, verify that the command
# has completed successfully. In any other case, raise an exception.
ret = await proc.wait()
if ret != 0:
raise RuntimeError(error_message)
@ -179,15 +149,6 @@ class DangerzoneConverter:
stderr = await stderr_task
return (stdout, stderr)
def calculate_timeout(
self, size: float, pages: Optional[float] = None
) -> Optional[float]:
"""Calculate the timeout for a command."""
if not int(os.environ.get("ENABLE_TIMEOUTS", 1)):
return None
return calculate_timeout(size, pages)
@abstractmethod
async def convert(self) -> None:
pass

View file

@ -49,7 +49,6 @@ class PixelsToPDF(DangerzoneConverter):
# The first few operations happen on a per-page basis.
page_size = len(untrusted_rgb_data)
total_size += page_size
timeout = self.calculate_timeout(page_size, 1)
pixmap = fitz.Pixmap(
fitz.Colorspace(fitz.CS_RGB), width, height, untrusted_rgb_data, False
)
@ -75,10 +74,6 @@ class PixelsToPDF(DangerzoneConverter):
safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes))
self.percentage += percentage_per_page
# Next operations apply to the all the pages, so we need to recalculate the
# timeout.
timeout = self.calculate_timeout(total_size, num_pages)
self.percentage = 100.0
self.update_progress("Safe PDF created")
@ -110,7 +105,7 @@ async def main() -> int:
try:
await converter.convert(ocr_lang)
return 0
except (RuntimeError, TimeoutError, ValueError) as e:
except (RuntimeError, ValueError) as e:
converter.update_progress(str(e), error=True)
return 1

View file

@ -96,12 +96,6 @@ class Application(QtWidgets.QApplication):
@click.option(
"--unsafe-dummy-conversion", "dummy_conversion", flag_value=True, hidden=True
)
@click.option(
"--enable-timeouts / --disable-timeouts",
default=True,
show_default=True,
help="Enable/Disable timeouts during document conversion",
)
@click.argument(
"filenames",
required=False,
@ -111,9 +105,7 @@ class Application(QtWidgets.QApplication):
)
@click.version_option(version=get_version(), message="%(version)s")
@errors.handle_document_errors
def gui_main(
dummy_conversion: bool, filenames: Optional[List[str]], enable_timeouts: bool
) -> bool:
def gui_main(dummy_conversion: bool, filenames: Optional[List[str]]) -> bool:
setup_logging()
if platform.system() == "Darwin":
@ -138,7 +130,7 @@ def gui_main(
qubes = Qubes()
dangerzone = DangerzoneGui(app, isolation_provider=qubes)
else:
container = Container(enable_timeouts=enable_timeouts)
container = Container()
dangerzone = DangerzoneGui(app, isolation_provider=container)
# Allow Ctrl-C to smoothly quit the program instead of throwing an exception

View file

@ -10,9 +10,9 @@ from typing import IO, Callable, Optional
from colorama import Fore, Style
from ..conversion import errors
from ..conversion.common import calculate_timeout
from ..conversion.common import INT_BYTES
from ..document import Document
from ..util import Stopwatch, nonblocking_read, replace_control_chars
from ..util import replace_control_chars
log = logging.getLogger(__name__)
@ -23,27 +23,26 @@ PIXELS_TO_PDF_LOG_START = "----- PIXELS TO PDF LOG START -----"
PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----"
def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes:
def read_bytes(f: IO[bytes], size: int, exact: bool = True) -> bytes:
"""Read bytes from a file-like object."""
buf = nonblocking_read(f, size, timeout)
buf = f.read(size)
if exact and len(buf) != size:
raise errors.InterruptedConversionException()
return buf
def read_int(f: IO[bytes], timeout: float) -> int:
def read_int(f: IO[bytes]) -> int:
"""Read 2 bytes from a file-like object, and decode them as int."""
untrusted_int = read_bytes(f, 2, timeout)
if len(untrusted_int) != 2:
untrusted_int = f.read(INT_BYTES)
if len(untrusted_int) != INT_BYTES:
raise errors.InterruptedConversionException()
return int.from_bytes(untrusted_int, "big", signed=False)
def read_debug_text(f: IO[bytes], size: int) -> str:
"""Read arbitrarily long text (for debug purposes)"""
timeout = calculate_timeout(size)
untrusted_text = read_bytes(f, size, timeout, exact=False)
return untrusted_text.decode("ascii", errors="replace")
untrusted_text = f.read(size).decode("ascii", errors="replace")
return replace_control_chars(untrusted_text)
class IsolationProvider(ABC):
@ -51,8 +50,6 @@ class IsolationProvider(ABC):
Abstracts an isolation provider
"""
STARTUP_TIME_SECONDS = 0 # The maximum time it takes a the provider to start up.
def __init__(self) -> None:
if getattr(sys, "dangerzone_dev", False) == True:
self.proc_stderr = subprocess.PIPE
@ -105,28 +102,18 @@ class IsolationProvider(ABC):
except BrokenPipeError as e:
raise errors.ConverterProcException(p)
# Get file size (in MiB)
size = os.path.getsize(document.input_filename) / 1024**2
timeout = calculate_timeout(size) + self.STARTUP_TIME_SECONDS
assert p is not None
assert p.stdout is not None
os.set_blocking(p.stdout.fileno(), False)
n_pages = read_int(p.stdout, timeout)
assert p.stdout
n_pages = read_int(p.stdout)
if n_pages == 0 or n_pages > errors.MAX_PAGES:
raise errors.MaxPagesException()
percentage_per_page = 50.0 / n_pages
timeout = calculate_timeout(size, n_pages)
sw = Stopwatch(timeout)
sw.start()
for page in range(1, n_pages + 1):
text = f"Converting page {page}/{n_pages} to pixels"
self.print_progress(document, False, text, percentage)
width = read_int(p.stdout, timeout=sw.remaining)
height = read_int(p.stdout, timeout=sw.remaining)
width = read_int(p.stdout)
height = read_int(p.stdout)
if not (1 <= width <= errors.MAX_PAGE_WIDTH):
raise errors.MaxPageWidthException()
if not (1 <= height <= errors.MAX_PAGE_HEIGHT):
@ -136,7 +123,6 @@ class IsolationProvider(ABC):
untrusted_pixels = read_bytes(
p.stdout,
num_pixels,
timeout=sw.remaining,
)
# Wrapper code
@ -157,8 +143,7 @@ class IsolationProvider(ABC):
self.print_progress(document, False, text, percentage)
if getattr(sys, "dangerzone_dev", False):
assert p.stderr is not None
os.set_blocking(p.stderr.fileno(), False)
assert p.stderr
untrusted_log = read_debug_text(p.stderr, MAX_CONVERSION_LOG_CHARS)
p.stderr.close()
log.info(

View file

@ -33,11 +33,6 @@ class NoContainerTechException(Exception):
class Container(IsolationProvider):
# Name of the dangerzone container
CONTAINER_NAME = "dangerzone.rocks/dangerzone"
STARTUP_TIME_SECONDS = 5
def __init__(self, enable_timeouts: bool) -> None:
self.enable_timeouts = 1 if enable_timeouts else 0
super().__init__()
@staticmethod
def get_runtime_name() -> str:
@ -227,8 +222,6 @@ class Container(IsolationProvider):
f"OCR={0 if ocr_lang is None else 1}",
"-e",
f"OCR_LANGUAGE={ocr_lang}",
"-e",
f"ENABLE_TIMEOUTS={self.enable_timeouts}",
]
pixels_to_pdf_proc = self.exec_container(command, extra_args)
@ -256,14 +249,10 @@ class Container(IsolationProvider):
"-m",
"dangerzone.conversion.doc_to_pixels",
]
extra_args = [
"-e",
f"ENABLE_TIMEOUTS={self.enable_timeouts}",
]
return self.exec_container(command, extra_args)
return self.exec_container(command)
def get_max_parallel_conversions(self) -> int:
# FIXME hardcoded 1 until timeouts are more limited and better handled
# FIXME hardcoded 1 until length conversions are better handled
# https://github.com/freedomofpress/dangerzone/issues/257
return 1

View file

@ -23,8 +23,6 @@ log = logging.getLogger(__name__)
class Qubes(IsolationProvider):
"""Uses a disposable qube for performing the conversion"""
STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes
def install(self) -> bool:
return True
@ -37,7 +35,7 @@ class Qubes(IsolationProvider):
converter = PixelsToPDF(progress_callback=print_progress_wrapper)
try:
asyncio.run(converter.convert(ocr_lang, tempdir))
except (RuntimeError, TimeoutError, ValueError) as e:
except (RuntimeError, ValueError) as e:
raise errors.UnexpectedConversionError(str(e))
finally:
if getattr(sys, "dangerzone_dev", False):

View file

@ -22,7 +22,7 @@ from .base import IsolationProviderTest
@pytest.fixture
def provider() -> Container:
return Container(enable_timeouts=False)
return Container()
class TestContainer(IsolationProviderTest):

View file

@ -49,7 +49,7 @@ def test_ocr_ommisions() -> None:
installed_langs -= {"osd", "equ"}
# Grab the languages that Dangerzone offers to the user through the GUI/CLI.
offered_langs = set(DangerzoneCore(Container(True)).ocr_languages.values())
offered_langs = set(DangerzoneCore(Container()).ocr_languages.values())
# Ensure that both the installed languages and the ones we offer to the user are the
# same.