Save RGB as PNG immediately to save space

Storing all RGB files on the host was leading to a fast-filling `/tmp`.
This solution essentially converts all the RGB files to PNGs (which are
compressed), saving valuable space in the process. This conversion is
done with the Pillow (PIL) module, without the need for any external
dependencies.

Fixes #526
This commit is contained in:
deeplow 2023-10-31 13:20:43 +00:00
parent 45a71224cb
commit c15cbd65c6
No known key found for this signature in database
GPG key ID: 577982871529A52A
5 changed files with 66 additions and 55 deletions

View file

@ -104,6 +104,11 @@ class PDFtoPPMInvalidDepth(PDFtoPPMException):
error_message = "Error converting PDF to Pixels (Invalid PPM depth)"
# Conversion error for a document page that could not be rebuilt from its
# pixel data (see `error_message`). Its error code is `ERROR_SHIFT` plus 55.
class PPMtoPNGError(ConversionException):
error_code = ERROR_SHIFT + 55
error_message = "Document page could not be reassembled from individual pixels"
class InterruptedConversion(ConversionException):
"""Protocol received num of bytes different than expected"""

View file

@ -18,33 +18,22 @@ from .common import DangerzoneConverter, running_on_qubes
class PixelsToPDF(DangerzoneConverter):
async def convert(
self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None
self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = "/tmp"
) -> None:
self.percentage = 50.0
if tempdir is None:
tempdir = "/tmp"
num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb"))
num_pages = len(glob.glob(f"{tempdir}/page-*.png"))
total_size = 0.0
# Convert RGB files to PDF files
percentage_per_page = 45.0 / num_pages
for page in range(1, num_pages + 1):
filename_base = f"{tempdir}/dangerzone/page-{page}"
rgb_filename = f"{filename_base}.rgb"
width_filename = f"{filename_base}.width"
height_filename = f"{filename_base}.height"
png_filename = f"{tempdir}/page-{page}.png"
ocr_filename = f"{tempdir}/page-{page}"
pdf_filename = f"{tempdir}/page-{page}.pdf"
with open(width_filename) as f:
width = f.read().strip()
with open(height_filename) as f:
height = f.read().strip()
filename_base = f"{tempdir}/page-{page}"
png_filename = f"{filename_base}.png"
pdf_filename = f"{filename_base}.pdf"
# The first few operations happen on a per-page basis.
page_size = os.path.getsize(filename_base + ".rgb") / 1024**2
page_size = os.path.getsize(png_filename) / 1024**2
total_size += page_size
timeout = self.calculate_timeout(page_size, 1)
@ -52,29 +41,11 @@ class PixelsToPDF(DangerzoneConverter):
self.update_progress(
f"Converting page {page}/{num_pages} from pixels to searchable PDF"
)
await self.run_command(
[
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"png:{png_filename}",
],
error_message=f"Page {page}/{num_pages} conversion to PNG failed",
timeout_message=(
"Error converting pixels to PNG, convert timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)
await self.run_command(
[
"tesseract",
png_filename,
ocr_filename,
filename_base,
"-l",
ocr_lang,
"--dpi",
@ -97,11 +68,7 @@ class PixelsToPDF(DangerzoneConverter):
[
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"png:{png_filename}",
f"pdf:{pdf_filename}",
],
error_message=f"Page {page}/{num_pages} conversion to PDF failed",
@ -112,6 +79,9 @@ class PixelsToPDF(DangerzoneConverter):
timeout=timeout,
)
# remove PNG file when it is no longer needed
os.remove(png_filename)
self.percentage += percentage_per_page
# Next operations apply to all the pages, so we need to recalculate the
@ -165,7 +135,7 @@ async def main() -> int:
converter = PixelsToPDF()
try:
await converter.convert(ocr_lang)
await converter.convert(ocr_lang, tempdir="/tmp/dangerzone")
error_code = 0 # Success!
except (RuntimeError, TimeoutError, ValueError) as e:

View file

@ -1,11 +1,14 @@
import io
import logging
import subprocess
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Callable, Optional
from colorama import Fore, Style
from PIL import Image, UnidentifiedImageError
from ..conversion.errors import ConversionException
from ..conversion import errors
from ..document import Document
from ..util import replace_control_chars
@ -37,7 +40,7 @@ class IsolationProvider(ABC):
document.mark_as_converting()
try:
success = self._convert(document, ocr_lang)
except ConversionException as e:
except errors.ConversionException as e:
success = False
self.print_progress_trusted(document, True, str(e), 0)
except Exception as e:
@ -101,6 +104,21 @@ class IsolationProvider(ABC):
armor_end = DOC_TO_PIXELS_LOG_END
return armor_start + conversion_string + armor_end
def convert_pixels_to_png(
    self, tempdir: str, page: int, width: int, height: int, rgb_data: bytes
) -> None:
    """
    Rebuild one page from raw RGB pixels and store it as a PNG to save space.

    The pixel data is prefixed with a binary PPM ("P6") header so that Pillow
    can parse it, and the decoded image is written to
    `<tempdir>/page-<page>.png`.

    Args:
        tempdir: Directory in which the PNG file is created.
        page: Page number, used in the output file name.
        width: Page width in pixels, written into the PPM header.
        height: Page height in pixels, written into the PPM header.
        rgb_data: Raw pixel bytes for the page. Callers pass untrusted data
            here, so it may be malformed or truncated.

    Raises:
        errors.PPMtoPNGError: If Pillow cannot identify or fully decode the
            reconstructed image.
    """
    ppm_header = f"P6\n{width} {height}\n255\n".encode()
    ppm_data = io.BytesIO(ppm_header + rgb_data)
    png_path = Path(tempdir) / f"page-{page}.png"

    try:
        image = Image.open(ppm_data)
        # Image.open() only parses the header; pixel data is decoded lazily.
        # Force the decode here so that truncated or corrupt pixel data is
        # reported as a conversion error instead of surfacing later from
        # save() as an unrelated OSError.
        image.load()
    except (OSError, ValueError) as e:
        # UnidentifiedImageError is a subclass of OSError, so it is still
        # covered. ValueError covers header values that Pillow rejects.
        raise errors.PPMtoPNGError() from e

    # Saving happens outside the try block on purpose: a failure to write the
    # file (e.g. a full disk) is not a pixel-reassembly error.
    image.save(png_path, "PNG")
# From global_common:

View file

@ -1,3 +1,4 @@
import glob
import gzip
import json
import logging
@ -11,7 +12,7 @@ import sys
import tempfile
from typing import Any, Callable, List, Optional, Tuple
from ..conversion.errors import exception_from_error_code
from ..conversion import errors
from ..document import Document
from ..util import (
get_resource_path,
@ -304,11 +305,31 @@ class Container(IsolationProvider):
f"Conversion output (doc to pixels):\n{self.sanitize_conversion_str(untrusted_log)}"
)
num_pages = len(glob.glob(f"{pixel_dir}/page-*.rgb"))
for page in range(1, num_pages + 1):
filename_base = f"{pixel_dir}/page-{page}"
rgb_filename = f"{filename_base}.rgb"
width_filename = f"{filename_base}.width"
height_filename = f"{filename_base}.height"
with open(width_filename) as f:
width = int(f.read().strip())
with open(height_filename) as f:
height = int(f.read().strip())
with open(rgb_filename, "rb") as rgb_f:
untrusted_pixels = rgb_f.read()
self.convert_pixels_to_png(
str(pixel_dir), page, width, height, rgb_data=untrusted_pixels
)
os.remove(rgb_filename)
os.remove(width_filename)
os.remove(height_filename)
if ret != 0:
log.error("documents-to-pixels failed")
# XXX Reconstruct exception from error code
raise exception_from_error_code(ret) # type: ignore [misc]
raise errors.exception_from_error_code(ret) # type: ignore [misc]
else:
# TODO: validate convert to pixels output

View file

@ -76,7 +76,6 @@ class Qubes(IsolationProvider):
) -> bool:
success = False
Path(f"{tempdir}/dangerzone").mkdir()
percentage = 0.0
with open(document.input_filename, "rb") as f:
@ -122,13 +121,9 @@ class Qubes(IsolationProvider):
timeout=sw.remaining,
)
# Wrapper code
with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width:
f_width.write(str(width))
with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height:
f_height.write(str(height))
with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb:
f_rgb.write(untrusted_pixels)
self.convert_pixels_to_png(
tempdir, page, width, height, rgb_data=untrusted_pixels
)
percentage += percentage_per_page
@ -165,7 +160,9 @@ class Qubes(IsolationProvider):
)
log.info(text)
shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename)
shutil.move(
Path(tempdir) / "safe-output-compressed.pdf", document.output_filename
)
success = True
return success