mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-29 10:12:38 +02:00
Parallel CLI bulk conversions via threading
Initial parallel document conversion: creates a pool of N threads defined by the setting 'parallel_conversions'. Each thread calls convert() on a document.
This commit is contained in:
parent
e17912888a
commit
2d587f4082
2 changed files with 29 additions and 5 deletions
|
@ -254,6 +254,27 @@ def convert(
|
|||
return success
|
||||
|
||||
|
||||
def get_max_parallel_conversions() -> int:
    """Return the maximum number of document conversions to run in parallel.

    The result is ``2 * n_cpu + 1``, where ``n_cpu`` is:

    * on Linux, the value of ``os.cpu_count()`` (containers run natively);
    * on other platforms with the "docker" runtime, the CPU count reported
      by ``<runtime> info --format {{.NCPU}}`` (containers run in a VM, so
      the VM's CPU count is what matters);
    * ``1`` in every other case, including when the CPU count cannot be
      determined.

    Returns:
        A positive, odd integer (always at least 3).
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import logging

    n_cpu = 1

    if platform.system() == "Linux":
        # On Linux, containers run natively, so use the host CPU count.
        cpu_count = os.cpu_count()
        if cpu_count is not None:
            n_cpu = cpu_count
    elif get_runtime_name() == "docker":
        # On Windows and macOS, containers run in a VM, so we obtain the
        # CPU count for the VM from the container runtime itself.
        try:
            n_cpu_str = subprocess.check_output(
                [get_runtime(), "info", "--format", "{{.NCPU}}"],
                text=True,
                startupinfo=get_subprocess_startupinfo(),
            )
            n_cpu = int(n_cpu_str.strip())
        except (OSError, subprocess.SubprocessError, ValueError) as e:
            # Failing to query the runtime must not abort the conversions:
            # keep the conservative single-CPU default instead.
            logging.getLogger(__name__).warning(
                "Could not determine the container runtime CPU count: %s", e
            )
            n_cpu = 1

    # Guard against a nonsensical (zero or negative) reported CPU count so
    # that the result is always a valid worker count.
    return 2 * max(n_cpu, 1) + 1
|
||||
|
||||
|
||||
# From global_common:
|
||||
|
||||
# def validate_convert_to_pixel_output(self, common, output):
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import concurrent.futures
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
|
@ -11,7 +12,7 @@ from typing import Callable, List, Optional
|
|||
import appdirs
|
||||
import colorama
|
||||
|
||||
from .container import convert
|
||||
from . import container
|
||||
from .document import Document
|
||||
from .settings import Settings
|
||||
from .util import get_resource_path
|
||||
|
@ -49,10 +50,8 @@ class DangerzoneCore(object):
|
|||
def convert_documents(
|
||||
self, ocr_lang: Optional[str], stdout_callback: Callable[[str], None]
|
||||
) -> None:
|
||||
all_successful = True
|
||||
|
||||
for document in self.documents:
|
||||
success = convert(
|
||||
def convert_doc(document: Document) -> None:
|
||||
success = container.convert(
|
||||
document.input_filename,
|
||||
document.output_filename,
|
||||
ocr_lang,
|
||||
|
@ -63,6 +62,10 @@ class DangerzoneCore(object):
|
|||
else:
|
||||
document.mark_as_failed()
|
||||
|
||||
max_jobs = container.get_max_parallel_conversions()
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_jobs) as executor:
|
||||
executor.map(convert_doc, self.documents)
|
||||
|
||||
def get_safe_documents(self) -> List[Document]:
    """Return the documents in ``self.documents`` for which ``is_safe()`` is truthy."""
    return list(filter(lambda candidate: candidate.is_safe(), self.documents))
|
||||
|
||||
|
|
Loading…
Reference in a new issue