mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-05-04 20:51:49 +02:00
Save RGB as PNG immediately to save space
Storing all RGB files on the host was leading to a fast-filling `/tmp`. This solution essentially converts all the RGB files to PNGs (which are compressed), saving valuable space in the process. This conversion is done with the Pillow (PIL) module, without the need for any external dependencies. Fixes #526
This commit is contained in:
parent
45a71224cb
commit
c15cbd65c6
5 changed files with 66 additions and 55 deletions
|
@ -104,6 +104,11 @@ class PDFtoPPMInvalidDepth(PDFtoPPMException):
|
||||||
error_message = "Error converting PDF to Pixels (Invalid PPM depth)"
|
error_message = "Error converting PDF to Pixels (Invalid PPM depth)"
|
||||||
|
|
||||||
|
|
||||||
|
class PPMtoPNGError(ConversionException):
|
||||||
|
error_code = ERROR_SHIFT + 55
|
||||||
|
error_message = "Document page could not be reassembled from individual pixels"
|
||||||
|
|
||||||
|
|
||||||
class InterruptedConversion(ConversionException):
|
class InterruptedConversion(ConversionException):
|
||||||
"""Protocol received num of bytes different than expected"""
|
"""Protocol received num of bytes different than expected"""
|
||||||
|
|
||||||
|
|
|
@ -18,33 +18,22 @@ from .common import DangerzoneConverter, running_on_qubes
|
||||||
|
|
||||||
class PixelsToPDF(DangerzoneConverter):
|
class PixelsToPDF(DangerzoneConverter):
|
||||||
async def convert(
|
async def convert(
|
||||||
self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = None
|
self, ocr_lang: Optional[str] = None, tempdir: Optional[str] = "/tmp"
|
||||||
) -> None:
|
) -> None:
|
||||||
self.percentage = 50.0
|
self.percentage = 50.0
|
||||||
if tempdir is None:
|
|
||||||
tempdir = "/tmp"
|
|
||||||
|
|
||||||
num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb"))
|
num_pages = len(glob.glob(f"{tempdir}/page-*.png"))
|
||||||
total_size = 0.0
|
total_size = 0.0
|
||||||
|
|
||||||
# Convert RGB files to PDF files
|
# Convert RGB files to PDF files
|
||||||
percentage_per_page = 45.0 / num_pages
|
percentage_per_page = 45.0 / num_pages
|
||||||
for page in range(1, num_pages + 1):
|
for page in range(1, num_pages + 1):
|
||||||
filename_base = f"{tempdir}/dangerzone/page-{page}"
|
filename_base = f"{tempdir}/page-{page}"
|
||||||
rgb_filename = f"{filename_base}.rgb"
|
png_filename = f"{filename_base}.png"
|
||||||
width_filename = f"{filename_base}.width"
|
pdf_filename = f"{filename_base}.pdf"
|
||||||
height_filename = f"{filename_base}.height"
|
|
||||||
png_filename = f"{tempdir}/page-{page}.png"
|
|
||||||
ocr_filename = f"{tempdir}/page-{page}"
|
|
||||||
pdf_filename = f"{tempdir}/page-{page}.pdf"
|
|
||||||
|
|
||||||
with open(width_filename) as f:
|
|
||||||
width = f.read().strip()
|
|
||||||
with open(height_filename) as f:
|
|
||||||
height = f.read().strip()
|
|
||||||
|
|
||||||
# The first few operations happen on a per-page basis.
|
# The first few operations happen on a per-page basis.
|
||||||
page_size = os.path.getsize(filename_base + ".rgb") / 1024**2
|
page_size = os.path.getsize(png_filename) / 1024**2
|
||||||
total_size += page_size
|
total_size += page_size
|
||||||
timeout = self.calculate_timeout(page_size, 1)
|
timeout = self.calculate_timeout(page_size, 1)
|
||||||
|
|
||||||
|
@ -52,29 +41,11 @@ class PixelsToPDF(DangerzoneConverter):
|
||||||
self.update_progress(
|
self.update_progress(
|
||||||
f"Converting page {page}/{num_pages} from pixels to searchable PDF"
|
f"Converting page {page}/{num_pages} from pixels to searchable PDF"
|
||||||
)
|
)
|
||||||
await self.run_command(
|
|
||||||
[
|
|
||||||
"gm",
|
|
||||||
"convert",
|
|
||||||
"-size",
|
|
||||||
f"{width}x{height}",
|
|
||||||
"-depth",
|
|
||||||
"8",
|
|
||||||
f"rgb:{rgb_filename}",
|
|
||||||
f"png:{png_filename}",
|
|
||||||
],
|
|
||||||
error_message=f"Page {page}/{num_pages} conversion to PNG failed",
|
|
||||||
timeout_message=(
|
|
||||||
"Error converting pixels to PNG, convert timed out after"
|
|
||||||
f" {timeout} seconds"
|
|
||||||
),
|
|
||||||
timeout=timeout,
|
|
||||||
)
|
|
||||||
await self.run_command(
|
await self.run_command(
|
||||||
[
|
[
|
||||||
"tesseract",
|
"tesseract",
|
||||||
png_filename,
|
png_filename,
|
||||||
ocr_filename,
|
filename_base,
|
||||||
"-l",
|
"-l",
|
||||||
ocr_lang,
|
ocr_lang,
|
||||||
"--dpi",
|
"--dpi",
|
||||||
|
@ -97,11 +68,7 @@ class PixelsToPDF(DangerzoneConverter):
|
||||||
[
|
[
|
||||||
"gm",
|
"gm",
|
||||||
"convert",
|
"convert",
|
||||||
"-size",
|
f"png:{png_filename}",
|
||||||
f"{width}x{height}",
|
|
||||||
"-depth",
|
|
||||||
"8",
|
|
||||||
f"rgb:{rgb_filename}",
|
|
||||||
f"pdf:{pdf_filename}",
|
f"pdf:{pdf_filename}",
|
||||||
],
|
],
|
||||||
error_message=f"Page {page}/{num_pages} conversion to PDF failed",
|
error_message=f"Page {page}/{num_pages} conversion to PDF failed",
|
||||||
|
@ -112,6 +79,9 @@ class PixelsToPDF(DangerzoneConverter):
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# remove PNG file when it is no longer needed
|
||||||
|
os.remove(png_filename)
|
||||||
|
|
||||||
self.percentage += percentage_per_page
|
self.percentage += percentage_per_page
|
||||||
|
|
||||||
# Next operations apply to all the pages, so we need to recalculate the
|
# Next operations apply to all the pages, so we need to recalculate the
|
||||||
|
@ -165,7 +135,7 @@ async def main() -> int:
|
||||||
converter = PixelsToPDF()
|
converter = PixelsToPDF()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await converter.convert(ocr_lang)
|
await converter.convert(ocr_lang, tempdir="/tmp/dangerzone")
|
||||||
error_code = 0 # Success!
|
error_code = 0 # Success!
|
||||||
|
|
||||||
except (RuntimeError, TimeoutError, ValueError) as e:
|
except (RuntimeError, TimeoutError, ValueError) as e:
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
from typing import Callable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from colorama import Fore, Style
|
from colorama import Fore, Style
|
||||||
|
from PIL import Image, UnidentifiedImageError
|
||||||
|
|
||||||
from ..conversion.errors import ConversionException
|
from ..conversion import errors
|
||||||
from ..document import Document
|
from ..document import Document
|
||||||
from ..util import replace_control_chars
|
from ..util import replace_control_chars
|
||||||
|
|
||||||
|
@ -37,7 +40,7 @@ class IsolationProvider(ABC):
|
||||||
document.mark_as_converting()
|
document.mark_as_converting()
|
||||||
try:
|
try:
|
||||||
success = self._convert(document, ocr_lang)
|
success = self._convert(document, ocr_lang)
|
||||||
except ConversionException as e:
|
except errors.ConversionException as e:
|
||||||
success = False
|
success = False
|
||||||
self.print_progress_trusted(document, True, str(e), 0)
|
self.print_progress_trusted(document, True, str(e), 0)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -101,6 +104,21 @@ class IsolationProvider(ABC):
|
||||||
armor_end = DOC_TO_PIXELS_LOG_END
|
armor_end = DOC_TO_PIXELS_LOG_END
|
||||||
return armor_start + conversion_string + armor_end
|
return armor_start + conversion_string + armor_end
|
||||||
|
|
||||||
|
def convert_pixels_to_png(
|
||||||
|
self, tempdir: str, page: int, width: int, height: int, rgb_data: bytes
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Reconstruct PPM files and save as PNG to save space
|
||||||
|
"""
|
||||||
|
ppm_header = f"P6\n{width} {height}\n255\n".encode()
|
||||||
|
ppm_data = io.BytesIO(ppm_header + rgb_data)
|
||||||
|
png_path = Path(tempdir) / f"page-{page}.png"
|
||||||
|
|
||||||
|
try:
|
||||||
|
Image.open(ppm_data).save(png_path, "PNG")
|
||||||
|
except UnidentifiedImageError as e:
|
||||||
|
raise errors.PPMtoPNGError() from e
|
||||||
|
|
||||||
|
|
||||||
# From global_common:
|
# From global_common:
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import glob
|
||||||
import gzip
|
import gzip
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
@ -11,7 +12,7 @@ import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Any, Callable, List, Optional, Tuple
|
from typing import Any, Callable, List, Optional, Tuple
|
||||||
|
|
||||||
from ..conversion.errors import exception_from_error_code
|
from ..conversion import errors
|
||||||
from ..document import Document
|
from ..document import Document
|
||||||
from ..util import (
|
from ..util import (
|
||||||
get_resource_path,
|
get_resource_path,
|
||||||
|
@ -304,11 +305,31 @@ class Container(IsolationProvider):
|
||||||
f"Conversion output (doc to pixels):\n{self.sanitize_conversion_str(untrusted_log)}"
|
f"Conversion output (doc to pixels):\n{self.sanitize_conversion_str(untrusted_log)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
num_pages = len(glob.glob(f"{pixel_dir}/page-*.rgb"))
|
||||||
|
for page in range(1, num_pages + 1):
|
||||||
|
filename_base = f"{pixel_dir}/page-{page}"
|
||||||
|
rgb_filename = f"{filename_base}.rgb"
|
||||||
|
width_filename = f"{filename_base}.width"
|
||||||
|
height_filename = f"{filename_base}.height"
|
||||||
|
with open(width_filename) as f:
|
||||||
|
width = int(f.read().strip())
|
||||||
|
with open(height_filename) as f:
|
||||||
|
height = int(f.read().strip())
|
||||||
|
with open(rgb_filename, "rb") as rgb_f:
|
||||||
|
untrusted_pixels = rgb_f.read()
|
||||||
|
self.convert_pixels_to_png(
|
||||||
|
str(pixel_dir), page, width, height, rgb_data=untrusted_pixels
|
||||||
|
)
|
||||||
|
|
||||||
|
os.remove(rgb_filename)
|
||||||
|
os.remove(width_filename)
|
||||||
|
os.remove(height_filename)
|
||||||
|
|
||||||
if ret != 0:
|
if ret != 0:
|
||||||
log.error("documents-to-pixels failed")
|
log.error("documents-to-pixels failed")
|
||||||
|
|
||||||
# XXX Reconstruct exception from error code
|
# XXX Reconstruct exception from error code
|
||||||
raise exception_from_error_code(ret) # type: ignore [misc]
|
raise errors.exception_from_error_code(ret) # type: ignore [misc]
|
||||||
else:
|
else:
|
||||||
# TODO: validate convert to pixels output
|
# TODO: validate convert to pixels output
|
||||||
|
|
||||||
|
|
|
@ -76,7 +76,6 @@ class Qubes(IsolationProvider):
|
||||||
) -> bool:
|
) -> bool:
|
||||||
success = False
|
success = False
|
||||||
|
|
||||||
Path(f"{tempdir}/dangerzone").mkdir()
|
|
||||||
percentage = 0.0
|
percentage = 0.0
|
||||||
|
|
||||||
with open(document.input_filename, "rb") as f:
|
with open(document.input_filename, "rb") as f:
|
||||||
|
@ -122,13 +121,9 @@ class Qubes(IsolationProvider):
|
||||||
timeout=sw.remaining,
|
timeout=sw.remaining,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Wrapper code
|
self.convert_pixels_to_png(
|
||||||
with open(f"{tempdir}/dangerzone/page-{page}.width", "w") as f_width:
|
tempdir, page, width, height, rgb_data=untrusted_pixels
|
||||||
f_width.write(str(width))
|
)
|
||||||
with open(f"{tempdir}/dangerzone/page-{page}.height", "w") as f_height:
|
|
||||||
f_height.write(str(height))
|
|
||||||
with open(f"{tempdir}/dangerzone/page-{page}.rgb", "wb") as f_rgb:
|
|
||||||
f_rgb.write(untrusted_pixels)
|
|
||||||
|
|
||||||
percentage += percentage_per_page
|
percentage += percentage_per_page
|
||||||
|
|
||||||
|
@ -165,7 +160,9 @@ class Qubes(IsolationProvider):
|
||||||
)
|
)
|
||||||
log.info(text)
|
log.info(text)
|
||||||
|
|
||||||
shutil.move(f"{tempdir}/safe-output-compressed.pdf", document.output_filename)
|
shutil.move(
|
||||||
|
Path(tempdir) / "safe-output-compressed.pdf", document.output_filename
|
||||||
|
)
|
||||||
success = True
|
success = True
|
||||||
|
|
||||||
return success
|
return success
|
||||||
|
|
Loading…
Reference in a new issue