Make pdf to ppm conversion dependent on num pages

This commit is contained in:
deeplow 2023-01-23 13:20:15 +00:00
parent d28aa5a25b
commit 272d25aee0
No known key found for this signature in database
GPG key ID: 577982871529A52A

View file

@ -20,7 +20,7 @@ import shutil
import subprocess import subprocess
import sys import sys
import time import time
from typing import Callable, Dict, List, Optional from typing import Callable, Dict, List, Optional, Union
import magic import magic
@ -220,7 +220,24 @@ class DangerzoneConverter:
) )
self.percentage += 3 self.percentage += 3
self.update_progress("Obtaining PDF metadata") # Obtain number of pages
self.update_progress("Calculating number of pages")
self.num_pages: Union[None, int] = None
def get_num_pages(line: str) -> None:
search = re.search(r"^Pages: (\d+)", line)
if search is not None:
self.num_pages = int(search.group(1))
run_command(
["pdfinfo", pdf_filename],
error_message="PDF file is corrupted",
timeout_message=f"Extracting metadata from PDF timed out after 1 second",
timeout=1,
stdout_callback=get_num_pages,
)
if self.num_pages == None:
raise ValueError("Number of pages could not be extraced from PDF")
def pdftoppm_progress_callback(line: str) -> None: def pdftoppm_progress_callback(line: str) -> None:
"""Function called for every line the 'pdftoppm'command outputs """Function called for every line the 'pdftoppm'command outputs
@ -286,7 +303,9 @@ class DangerzoneConverter:
os.remove(ppm_filename) os.remove(ppm_filename)
page_base = "/tmp/page" page_base = "/tmp/page"
# Convert to PPM, which is essentially an RGB format # Convert to PPM, which is essentially an RGB format
pdftoppm_timeout = 1.0 * self.num_pages # type: ignore [operator]
run_command( run_command(
[ [
"pdftoppm", "pdftoppm",
@ -295,8 +314,9 @@ class DangerzoneConverter:
"-progress", "-progress",
], ],
error_message="Conversion from PDF to PPM failed", error_message="Conversion from PDF to PPM failed",
timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {DEFAULT_TIMEOUT} seconds", timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {pdftoppm_timeout} seconds",
stderr_callback=pdftoppm_progress_callback, stderr_callback=pdftoppm_progress_callback,
timeout=pdftoppm_timeout,
) )
self.update_progress("Converted document to pixels") self.update_progress("Converted document to pixels")