mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Remove timeouts
Remove timeouts due to several reasons: 1. Lost purpose: after implementing the containers page streaming the only subprocess we have left is LibreOffice. So don't have such a big risk of commands hanging (the original reason for timeouts). 2. Little benefit: predicting execution time is generically unsolvable computer science problem. Ultimately we were guessing an arbitrary time based on the number of pages and the document size. As a guess we made it pretty lax (30s per page or MB). A document hanging for this long will probably lead to user frustration in any case and the user may be compelled to abort the conversion. 3. Technical Challenges with non-blocking timeout: there have been several technical challenges in keeping timeouts that we've made effort to accommodate. A significant one was having to do non-blocking read to ensure we could timeout when reading conversion stream (and then used here) Fixes #687
This commit is contained in:
parent
4d3f2b32c7
commit
69c2a02d81
10 changed files with 32 additions and 115 deletions
|
@ -21,6 +21,10 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or
|
|||
- Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET))
|
||||
- Replace Dangerzone document rendering engine from pdftoppm PyMuPDF, essentially replacing a variety of tools (gm / tesseract / pdfunite / ps2pdf) ([issue #658](https://github.com/freedomofpress/dangerzone/issues/658))
|
||||
|
||||
### Removed
|
||||
|
||||
- Removed timeouts ([issue #687](https://github.com/freedomofpress/dangerzone/issues/687))
|
||||
|
||||
## Dangerzone 0.5.1
|
||||
|
||||
### Fixed
|
||||
|
|
|
@ -37,12 +37,6 @@ def print_header(s: str) -> None:
|
|||
@click.option(
|
||||
"--unsafe-dummy-conversion", "dummy_conversion", flag_value=True, hidden=True
|
||||
)
|
||||
@click.option(
|
||||
"--enable-timeouts / --disable-timeouts",
|
||||
default=True,
|
||||
show_default=True,
|
||||
help="Enable/Disable timeouts during document conversion",
|
||||
)
|
||||
@click.argument(
|
||||
"filenames",
|
||||
required=True,
|
||||
|
@ -55,7 +49,6 @@ def print_header(s: str) -> None:
|
|||
def cli_main(
|
||||
output_filename: Optional[str],
|
||||
ocr_lang: Optional[str],
|
||||
enable_timeouts: bool,
|
||||
filenames: List[str],
|
||||
archive: bool,
|
||||
dummy_conversion: bool,
|
||||
|
@ -67,7 +60,7 @@ def cli_main(
|
|||
elif is_qubes_native_conversion():
|
||||
dangerzone = DangerzoneCore(Qubes())
|
||||
else:
|
||||
dangerzone = DangerzoneCore(Container(enable_timeouts=enable_timeouts))
|
||||
dangerzone = DangerzoneCore(Container())
|
||||
|
||||
display_banner()
|
||||
if len(filenames) == 1 and output_filename:
|
||||
|
|
|
@ -12,10 +12,8 @@ import time
|
|||
from abc import abstractmethod
|
||||
from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union
|
||||
|
||||
TIMEOUT_PER_PAGE: float = 30 # (seconds)
|
||||
TIMEOUT_PER_MB: float = 30 # (seconds)
|
||||
TIMEOUT_MIN: float = 60 # (seconds)
|
||||
DEFAULT_DPI = 150 # Pixels per inch
|
||||
INT_BYTES = 2
|
||||
|
||||
|
||||
def running_on_qubes() -> bool:
|
||||
|
@ -23,28 +21,6 @@ def running_on_qubes() -> bool:
|
|||
return os.path.exists("/usr/share/qubes/marker-vm")
|
||||
|
||||
|
||||
def calculate_timeout(size: float, pages: Optional[float] = None) -> float:
|
||||
"""Calculate the timeout for a command.
|
||||
|
||||
The timeout calculation takes two factors in mind:
|
||||
|
||||
1. The size (in MiBs) of the dataset (document, multiple pages).
|
||||
2. The number of pages in the dataset.
|
||||
|
||||
It then calculates proportional timeout values based on the above, and keeps the
|
||||
large one. This way, we can handle several corner cases:
|
||||
|
||||
* Documents with lots of pages, but small file size.
|
||||
* Single images with large file size.
|
||||
"""
|
||||
# Do not have timeouts lower than 10 seconds, if the file size is small, since
|
||||
# we need to take into account the program's startup time as well.
|
||||
timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN)
|
||||
if pages:
|
||||
timeout = max(timeout, TIMEOUT_PER_PAGE * pages)
|
||||
return timeout
|
||||
|
||||
|
||||
def get_tessdata_dir() -> str:
|
||||
if running_on_qubes():
|
||||
return "/usr/share/tesseract/tessdata/"
|
||||
|
@ -76,7 +52,7 @@ class DangerzoneConverter:
|
|||
|
||||
@classmethod
|
||||
def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
|
||||
cls._write_bytes(num.to_bytes(2, "big", signed=False), file=file)
|
||||
cls._write_bytes(num.to_bytes(INT_BYTES, "big", signed=False), file=file)
|
||||
|
||||
# ==== ASYNC METHODS ====
|
||||
# We run sync methods in async wrappers, because pure async methods are more difficult:
|
||||
|
@ -127,8 +103,6 @@ class DangerzoneConverter:
|
|||
args: List[str],
|
||||
*,
|
||||
error_message: str,
|
||||
timeout_message: Optional[str] = None,
|
||||
timeout: Optional[float] = None,
|
||||
stdout_callback: Optional[Callable] = None,
|
||||
stderr_callback: Optional[Callable] = None,
|
||||
) -> Tuple[bytes, bytes]:
|
||||
|
@ -138,7 +112,6 @@ class DangerzoneConverter:
|
|||
output in bytes.
|
||||
|
||||
:raises RuntimeError: if the process returns a non-zero exit status
|
||||
:raises TimeoutError: if the process times out
|
||||
"""
|
||||
# Start the provided command, and return a handle. The command will run in the
|
||||
# background.
|
||||
|
@ -164,12 +137,9 @@ class DangerzoneConverter:
|
|||
self.read_stream(proc.stderr, stderr_callback)
|
||||
)
|
||||
|
||||
# Wait until the command has finished, for a specific timeout. Then, verify that the
|
||||
# command has completed successfully. In any other case, raise an exception.
|
||||
try:
|
||||
ret = await asyncio.wait_for(proc.wait(), timeout=timeout)
|
||||
except asyncio.exceptions.TimeoutError:
|
||||
raise TimeoutError(timeout_message)
|
||||
# Wait until the command has finished. Then, verify that the command
|
||||
# has completed successfully. In any other case, raise an exception.
|
||||
ret = await proc.wait()
|
||||
if ret != 0:
|
||||
raise RuntimeError(error_message)
|
||||
|
||||
|
@ -179,15 +149,6 @@ class DangerzoneConverter:
|
|||
stderr = await stderr_task
|
||||
return (stdout, stderr)
|
||||
|
||||
def calculate_timeout(
|
||||
self, size: float, pages: Optional[float] = None
|
||||
) -> Optional[float]:
|
||||
"""Calculate the timeout for a command."""
|
||||
if not int(os.environ.get("ENABLE_TIMEOUTS", 1)):
|
||||
return None
|
||||
|
||||
return calculate_timeout(size, pages)
|
||||
|
||||
@abstractmethod
|
||||
async def convert(self) -> None:
|
||||
pass
|
||||
|
|
|
@ -49,7 +49,6 @@ class PixelsToPDF(DangerzoneConverter):
|
|||
# The first few operations happen on a per-page basis.
|
||||
page_size = len(untrusted_rgb_data)
|
||||
total_size += page_size
|
||||
timeout = self.calculate_timeout(page_size, 1)
|
||||
pixmap = fitz.Pixmap(
|
||||
fitz.Colorspace(fitz.CS_RGB), width, height, untrusted_rgb_data, False
|
||||
)
|
||||
|
@ -75,10 +74,6 @@ class PixelsToPDF(DangerzoneConverter):
|
|||
safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes))
|
||||
self.percentage += percentage_per_page
|
||||
|
||||
# Next operations apply to the all the pages, so we need to recalculate the
|
||||
# timeout.
|
||||
timeout = self.calculate_timeout(total_size, num_pages)
|
||||
|
||||
self.percentage = 100.0
|
||||
self.update_progress("Safe PDF created")
|
||||
|
||||
|
@ -110,7 +105,7 @@ async def main() -> int:
|
|||
try:
|
||||
await converter.convert(ocr_lang)
|
||||
return 0
|
||||
except (RuntimeError, TimeoutError, ValueError) as e:
|
||||
except (RuntimeError, ValueError) as e:
|
||||
converter.update_progress(str(e), error=True)
|
||||
return 1
|
||||
|
||||
|
|
|
@ -96,12 +96,6 @@ class Application(QtWidgets.QApplication):
|
|||
@click.option(
|
||||
"--unsafe-dummy-conversion", "dummy_conversion", flag_value=True, hidden=True
|
||||
)
|
||||
@click.option(
|
||||
"--enable-timeouts / --disable-timeouts",
|
||||
default=True,
|
||||
show_default=True,
|
||||
help="Enable/Disable timeouts during document conversion",
|
||||
)
|
||||
@click.argument(
|
||||
"filenames",
|
||||
required=False,
|
||||
|
@ -111,9 +105,7 @@ class Application(QtWidgets.QApplication):
|
|||
)
|
||||
@click.version_option(version=get_version(), message="%(version)s")
|
||||
@errors.handle_document_errors
|
||||
def gui_main(
|
||||
dummy_conversion: bool, filenames: Optional[List[str]], enable_timeouts: bool
|
||||
) -> bool:
|
||||
def gui_main(dummy_conversion: bool, filenames: Optional[List[str]]) -> bool:
|
||||
setup_logging()
|
||||
|
||||
if platform.system() == "Darwin":
|
||||
|
@ -138,7 +130,7 @@ def gui_main(
|
|||
qubes = Qubes()
|
||||
dangerzone = DangerzoneGui(app, isolation_provider=qubes)
|
||||
else:
|
||||
container = Container(enable_timeouts=enable_timeouts)
|
||||
container = Container()
|
||||
dangerzone = DangerzoneGui(app, isolation_provider=container)
|
||||
|
||||
# Allow Ctrl-C to smoothly quit the program instead of throwing an exception
|
||||
|
|
|
@ -10,9 +10,9 @@ from typing import IO, Callable, Optional
|
|||
from colorama import Fore, Style
|
||||
|
||||
from ..conversion import errors
|
||||
from ..conversion.common import calculate_timeout
|
||||
from ..conversion.common import INT_BYTES
|
||||
from ..document import Document
|
||||
from ..util import Stopwatch, nonblocking_read, replace_control_chars
|
||||
from ..util import replace_control_chars
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -23,27 +23,26 @@ PIXELS_TO_PDF_LOG_START = "----- PIXELS TO PDF LOG START -----"
|
|||
PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----"
|
||||
|
||||
|
||||
def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes:
|
||||
def read_bytes(f: IO[bytes], size: int, exact: bool = True) -> bytes:
|
||||
"""Read bytes from a file-like object."""
|
||||
buf = nonblocking_read(f, size, timeout)
|
||||
buf = f.read(size)
|
||||
if exact and len(buf) != size:
|
||||
raise errors.InterruptedConversionException()
|
||||
return buf
|
||||
|
||||
|
||||
def read_int(f: IO[bytes], timeout: float) -> int:
|
||||
def read_int(f: IO[bytes]) -> int:
|
||||
"""Read 2 bytes from a file-like object, and decode them as int."""
|
||||
untrusted_int = read_bytes(f, 2, timeout)
|
||||
if len(untrusted_int) != 2:
|
||||
untrusted_int = f.read(INT_BYTES)
|
||||
if len(untrusted_int) != INT_BYTES:
|
||||
raise errors.InterruptedConversionException()
|
||||
return int.from_bytes(untrusted_int, "big", signed=False)
|
||||
|
||||
|
||||
def read_debug_text(f: IO[bytes], size: int) -> str:
|
||||
"""Read arbitrarily long text (for debug purposes)"""
|
||||
timeout = calculate_timeout(size)
|
||||
untrusted_text = read_bytes(f, size, timeout, exact=False)
|
||||
return untrusted_text.decode("ascii", errors="replace")
|
||||
untrusted_text = f.read(size).decode("ascii", errors="replace")
|
||||
return replace_control_chars(untrusted_text)
|
||||
|
||||
|
||||
class IsolationProvider(ABC):
|
||||
|
@ -51,8 +50,6 @@ class IsolationProvider(ABC):
|
|||
Abstracts an isolation provider
|
||||
"""
|
||||
|
||||
STARTUP_TIME_SECONDS = 0 # The maximum time it takes a the provider to start up.
|
||||
|
||||
def __init__(self) -> None:
|
||||
if getattr(sys, "dangerzone_dev", False) == True:
|
||||
self.proc_stderr = subprocess.PIPE
|
||||
|
@ -105,28 +102,18 @@ class IsolationProvider(ABC):
|
|||
except BrokenPipeError as e:
|
||||
raise errors.ConverterProcException(p)
|
||||
|
||||
# Get file size (in MiB)
|
||||
size = os.path.getsize(document.input_filename) / 1024**2
|
||||
timeout = calculate_timeout(size) + self.STARTUP_TIME_SECONDS
|
||||
|
||||
assert p is not None
|
||||
assert p.stdout is not None
|
||||
os.set_blocking(p.stdout.fileno(), False)
|
||||
|
||||
n_pages = read_int(p.stdout, timeout)
|
||||
assert p.stdout
|
||||
n_pages = read_int(p.stdout)
|
||||
if n_pages == 0 or n_pages > errors.MAX_PAGES:
|
||||
raise errors.MaxPagesException()
|
||||
percentage_per_page = 50.0 / n_pages
|
||||
|
||||
timeout = calculate_timeout(size, n_pages)
|
||||
sw = Stopwatch(timeout)
|
||||
sw.start()
|
||||
for page in range(1, n_pages + 1):
|
||||
text = f"Converting page {page}/{n_pages} to pixels"
|
||||
self.print_progress(document, False, text, percentage)
|
||||
|
||||
width = read_int(p.stdout, timeout=sw.remaining)
|
||||
height = read_int(p.stdout, timeout=sw.remaining)
|
||||
width = read_int(p.stdout)
|
||||
height = read_int(p.stdout)
|
||||
if not (1 <= width <= errors.MAX_PAGE_WIDTH):
|
||||
raise errors.MaxPageWidthException()
|
||||
if not (1 <= height <= errors.MAX_PAGE_HEIGHT):
|
||||
|
@ -136,7 +123,6 @@ class IsolationProvider(ABC):
|
|||
untrusted_pixels = read_bytes(
|
||||
p.stdout,
|
||||
num_pixels,
|
||||
timeout=sw.remaining,
|
||||
)
|
||||
|
||||
# Wrapper code
|
||||
|
@ -157,8 +143,7 @@ class IsolationProvider(ABC):
|
|||
self.print_progress(document, False, text, percentage)
|
||||
|
||||
if getattr(sys, "dangerzone_dev", False):
|
||||
assert p.stderr is not None
|
||||
os.set_blocking(p.stderr.fileno(), False)
|
||||
assert p.stderr
|
||||
untrusted_log = read_debug_text(p.stderr, MAX_CONVERSION_LOG_CHARS)
|
||||
p.stderr.close()
|
||||
log.info(
|
||||
|
|
|
@ -33,11 +33,6 @@ class NoContainerTechException(Exception):
|
|||
class Container(IsolationProvider):
|
||||
# Name of the dangerzone container
|
||||
CONTAINER_NAME = "dangerzone.rocks/dangerzone"
|
||||
STARTUP_TIME_SECONDS = 5
|
||||
|
||||
def __init__(self, enable_timeouts: bool) -> None:
|
||||
self.enable_timeouts = 1 if enable_timeouts else 0
|
||||
super().__init__()
|
||||
|
||||
@staticmethod
|
||||
def get_runtime_name() -> str:
|
||||
|
@ -227,8 +222,6 @@ class Container(IsolationProvider):
|
|||
f"OCR={0 if ocr_lang is None else 1}",
|
||||
"-e",
|
||||
f"OCR_LANGUAGE={ocr_lang}",
|
||||
"-e",
|
||||
f"ENABLE_TIMEOUTS={self.enable_timeouts}",
|
||||
]
|
||||
|
||||
pixels_to_pdf_proc = self.exec_container(command, extra_args)
|
||||
|
@ -256,14 +249,10 @@ class Container(IsolationProvider):
|
|||
"-m",
|
||||
"dangerzone.conversion.doc_to_pixels",
|
||||
]
|
||||
extra_args = [
|
||||
"-e",
|
||||
f"ENABLE_TIMEOUTS={self.enable_timeouts}",
|
||||
]
|
||||
return self.exec_container(command, extra_args)
|
||||
return self.exec_container(command)
|
||||
|
||||
def get_max_parallel_conversions(self) -> int:
|
||||
# FIXME hardcoded 1 until timeouts are more limited and better handled
|
||||
# FIXME hardcoded 1 until length conversions are better handled
|
||||
# https://github.com/freedomofpress/dangerzone/issues/257
|
||||
return 1
|
||||
|
||||
|
|
|
@ -23,8 +23,6 @@ log = logging.getLogger(__name__)
|
|||
class Qubes(IsolationProvider):
|
||||
"""Uses a disposable qube for performing the conversion"""
|
||||
|
||||
STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes
|
||||
|
||||
def install(self) -> bool:
|
||||
return True
|
||||
|
||||
|
@ -37,7 +35,7 @@ class Qubes(IsolationProvider):
|
|||
converter = PixelsToPDF(progress_callback=print_progress_wrapper)
|
||||
try:
|
||||
asyncio.run(converter.convert(ocr_lang, tempdir))
|
||||
except (RuntimeError, TimeoutError, ValueError) as e:
|
||||
except (RuntimeError, ValueError) as e:
|
||||
raise errors.UnexpectedConversionError(str(e))
|
||||
finally:
|
||||
if getattr(sys, "dangerzone_dev", False):
|
||||
|
|
|
@ -22,7 +22,7 @@ from .base import IsolationProviderTest
|
|||
|
||||
@pytest.fixture
|
||||
def provider() -> Container:
|
||||
return Container(enable_timeouts=False)
|
||||
return Container()
|
||||
|
||||
|
||||
class TestContainer(IsolationProviderTest):
|
||||
|
|
|
@ -49,7 +49,7 @@ def test_ocr_ommisions() -> None:
|
|||
installed_langs -= {"osd", "equ"}
|
||||
|
||||
# Grab the languages that Dangerzone offers to the user through the GUI/CLI.
|
||||
offered_langs = set(DangerzoneCore(Container(True)).ocr_languages.values())
|
||||
offered_langs = set(DangerzoneCore(Container()).ocr_languages.values())
|
||||
|
||||
# Ensure that both the installed languages and the ones we offer to the user are the
|
||||
# same.
|
||||
|
|
Loading…
Reference in a new issue