Save RGB as PNG immediately to save space

Storing all RGB files on the host was leading to a fast-filling `/tmp`.
This solution essentially converts all the RGB files to PNGs (which are
compressed) saving valuable space in the process. This conversion is
made with the Pillow (PIL) module, without the need for any external
dependencies.

Fixes #526
This commit is contained in:
deeplow 2023-10-31 13:20:43 +00:00
parent 45a71224cb
commit c15cbd65c6
No known key found for this signature in database
GPG key ID: 577982871529A52A
5 changed files with 66 additions and 55 deletions

View file

@ -104,6 +104,11 @@ class PDFtoPPMInvalidDepth(PDFtoPPMException):
error_message = "Error converting PDF to Pixels (Invalid PPM depth)" error_message = "Error converting PDF to Pixels (Invalid PPM depth)"
class PPMtoPNGError(ConversionException):
    """Pixel data received for a page could not be turned into a PNG image"""

    error_code = ERROR_SHIFT + 55
    error_message = "Document page could not be reassembled from individual pixels"
class InterruptedConversion(ConversionException): class InterruptedConversion(ConversionException):
"""Protocol received num of bytes different than expected""" """Protocol received num of bytes different than expected"""

View file

@ -18,33 +18,22 @@ from .common import DangerzoneConverter, running_on_qubes
class PixelsToPDF(DangerzoneConverter): class PixelsToPDF(DangerzoneConverter):
async def convert( async def convert(
self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = "/tmp"
) -> None: ) -> None:
self.percentage = 50.0 self.percentage = 50.0
if tempdir is None:
tempdir = "/tmp"
num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb")) num_pages = len(glob.glob(f"{tempdir}/page-*.png"))
total_size = 0.0 total_size = 0.0
# Convert RGB files to PDF files # Convert RGB files to PDF files
percentage_per_page = 45.0 / num_pages percentage_per_page = 45.0 / num_pages
for page in range(1, num_pages + 1): for page in range(1, num_pages + 1):
filename_base = f"{tempdir}/dangerzone/page-{page}" filename_base = f"{tempdir}/page-{page}"
rgb_filename = f"{filename_base}.rgb" png_filename = f"{filename_base}.png"
width_filename = f"{filename_base}.width" pdf_filename = f"{filename_base}.pdf"
height_filename = f"{filename_base}.height"
png_filename = f"{tempdir}/page-{page}.png"
ocr_filename = f"{tempdir}/page-{page}"
pdf_filename = f"{tempdir}/page-{page}.pdf"
with open(width_filename) as f:
width = f.read().strip()
with open(height_filename) as f:
height = f.read().strip()
# The first few operations happen on a per-page basis. # The first few operations happen on a per-page basis.
page_size = os.path.getsize(filename_base + ".rgb") / 1024**2 page_size = os.path.getsize(png_filename) / 1024**2
total_size += page_size total_size += page_size
timeout = self.calculate_timeout(page_size, 1) timeout = self.calculate_timeout(page_size, 1)
@ -52,29 +41,11 @@ class PixelsToPDF(DangerzoneConverter):
self.update_progress( self.update_progress(
f"Converting page {page}/{num_pages} from pixels to searchable PDF" f"Converting page {page}/{num_pages} from pixels to searchable PDF"
) )
await self.run_command(
[
"gm",
"convert",
"-size",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"png:{png_filename}",
],
error_message=f"Page {page}/{num_pages} conversion to PNG failed",
timeout_message=(
"Error converting pixels to PNG, convert timed out after"
f" {timeout} seconds"
),
timeout=timeout,
)
await self.run_command( await self.run_command(
[ [
"tesseract", "tesseract",
png_filename, png_filename,
ocr_filename, filename_base,
"-l", "-l",
ocr_lang, ocr_lang,
"--dpi", "--dpi",
@ -97,11 +68,7 @@ class PixelsToPDF(DangerzoneConverter):
[ [
"gm", "gm",
"convert", "convert",
"-size", f"png:{png_filename}",
f"{width}x{height}",
"-depth",
"8",
f"rgb:{rgb_filename}",
f"pdf:{pdf_filename}", f"pdf:{pdf_filename}",
], ],
error_message=f"Page {page}/{num_pages} conversion to PDF failed", error_message=f"Page {page}/{num_pages} conversion to PDF failed",
@ -112,6 +79,9 @@ class PixelsToPDF(DangerzoneConverter):
timeout=timeout, timeout=timeout,
) )
# remove PNG file when it is no longer needed
os.remove(png_filename)
self.percentage += percentage_per_page self.percentage += percentage_per_page
# Next operations apply to the all the pages, so we need to recalculate the # Next operations apply to the all the pages, so we need to recalculate the
@ -165,7 +135,7 @@ async def main() -> int:
converter = PixelsToPDF() converter = PixelsToPDF()
try: try:
await converter.convert(ocr_lang) await converter.convert(ocr_lang, tempdir="/tmp/dangerzone")
error_code = 0 # Success! error_code = 0 # Success!
except (RuntimeError, TimeoutError, ValueError) as e: except (RuntimeError, TimeoutError, ValueError) as e:

View file

@ -1,11 +1,14 @@
import io
import logging import logging
import subprocess import subprocess
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path
from typing import Callable, Optional from typing import Callable, Optional
from colorama import Fore, Style from colorama import Fore, Style
from PIL import Image, UnidentifiedImageError
from ..conversion.errors import ConversionException from ..conversion import errors
from ..document import Document from ..document import Document
from ..util import replace_control_chars from ..util import replace_control_chars
@ -37,7 +40,7 @@ class IsolationProvider(ABC):
document.mark_as_converting() document.mark_as_converting()
try: try:
success = self._convert(document, ocr_lang) success = self._convert(document, ocr_lang)
except ConversionException as e: except errors.ConversionException as e:
success = False success = False
self.print_progress_trusted(document, True, str(e), 0) self.print_progress_trusted(document, True, str(e), 0)
except Exception as e: except Exception as e:
@ -101,6 +104,21 @@ class IsolationProvider(ABC):
armor_end = DOC_TO_PIXELS_LOG_END armor_end = DOC_TO_PIXELS_LOG_END
return armor_start + conversion_string + armor_end return armor_start + conversion_string + armor_end
def convert_pixels_to_png(
    self, tempdir: str, page: int, width: int, height: int, rgb_data: bytes
) -> None:
    """
    Rebuild one page from raw RGB pixels and store it as a PNG file.

    The pixels are prefixed with a binary PPM ("P6", maxval 255) header so
    that Pillow can parse them, and the image is then written to
    `<tempdir>/page-<page>.png`. PNG is compressed, so the page takes less
    space on disk than the raw RGB data would.

    Args:
        tempdir: Directory in which the PNG file is created.
        page: Page number, used to build the output file name.
        width: Page width in pixels, written into the PPM header.
        height: Page height in pixels, written into the PPM header.
        rgb_data: Raw pixel bytes, placed right after the PPM header.

    Raises:
        errors.PPMtoPNGError: If Pillow cannot identify or fully decode the
            reassembled PPM image (for example, when `rgb_data` holds fewer
            bytes than the header announces).
    """
    ppm_header = f"P6\n{width} {height}\n255\n".encode()
    ppm_data = io.BytesIO(ppm_header + rgb_data)
    png_path = Path(tempdir) / f"page-{page}.png"

    try:
        image = Image.open(ppm_data)
        # Image.open() only parses the header; the pixels are decoded lazily.
        # Force decoding here so that bad pixel data is detected inside this
        # handler. Pillow reports truncated data as a plain OSError, not as
        # UnidentifiedImageError.
        image.load()
    except (UnidentifiedImageError, OSError) as e:
        raise errors.PPMtoPNGError() from e

    # Writing happens outside the handler above on purpose: a failure to
    # write the file (full disk, permissions) is not a pixel-data problem
    # and must not be reported as PPMtoPNGError.
    with image:
        image.save(png_path, "PNG")
# From global_common: # From global_common:

View file

@ -1,3 +1,4 @@
import glob
import gzip import gzip
import json import json
import logging import logging
@ -11,7 +12,7 @@ import sys
import tempfile import tempfile
from typing import Any, Callable, List, Optional, Tuple from typing import Any, Callable, List, Optional, Tuple
from ..conversion.errors import exception_from_error_code from ..conversion import errors
from ..document import Document from ..document import Document
from ..util import ( from ..util import (
get_resource_path, get_resource_path,
@ -304,11 +305,31 @@ class Container(IsolationProvider):
f"Conversion output (doc to pixels):\n{self.sanitize_conversion_str(untrusted_log)}" f"Conversion output (doc to pixels):\n{self.sanitize_conversion_str(untrusted_log)}"
) )
num_pages = len(glob.glob(f"{pixel_dir}/page-*.rgb"))
for page in range(1, num_pages + 1):
filename_base = f"{pixel_dir}/page-{page}"
rgb_filename = f"{filename_base}.rgb"
width_filename = f"{filename_base}.width"
height_filename = f"{filename_base}.height"
with open(width_filename) as f:
width = int(f.read().strip())
with open(height_filename) as f:
height = int(f.read().strip())
with open(rgb_filename, "rb") as rgb_f:
untrusted_pixels = rgb_f.read()
self.convert_pixels_to_png(
str(pixel_dir), page, width, height, rgb_data=untrusted_pixels
)
os.remove(rgb_filename)
os.remove(width_filename)
os.remove(height_filename)
if ret != 0: if ret != 0:
log.error("documents-to-pixels failed") log.error("documents-to-pixels failed")
# XXX Reconstruct exception from error code # XXX Reconstruct exception from error code
raise exception_from_error_code(ret) # type: ignore [misc] raise errors.exception_from_error_code(ret) # type: ignore [misc]
else: else:
# TODO: validate convert to pixels output # TODO: validate convert to pixels output

View file

@ -76,7 +76,6 @@ class Qubes(IsolationProvider):
) -> bool: ) -> bool:
success = False success = False
Path(f"{tempdir}/dangerzone").mkdir()
percentage = 0.0 percentage = 0.0
with open(document.input_filename, "rb") as f: with open(document.input_filename, "rb") as f:
@ -122,13 +121,9 @@ class Qubes(IsolationProvider):
timeout=sw.remaining, timeout=sw.remaining,
) )
# Wrapper code self.convert_pixels_to_png(
with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width: tempdir, page, width, height, rgb_data=untrusted_pixels
f_width.write(str(width)) )
with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height:
f_height.write(str(height))
with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb:
f_rgb.write(untrusted_pixels)
percentage += percentage_per_page percentage += percentage_per_page
@ -165,7 +160,9 @@ class Qubes(IsolationProvider):
) )
log.info(text) log.info(text)
shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename) shutil.move(
Path(tempdir) / "safe-output-compressed.pdf", document.output_filename
)
success = True success = True
return success return success