From f1d90c6fa94dbf8a20c93bb8fe6b728c8d400cd8 Mon Sep 17 00:00:00 2001
From: deeplow
Date: Wed, 3 Jan 2024 12:04:27 +0000
Subject: [PATCH] Compress per page when not using OCR

Make the compression happen per page when OCR is not enabled [1].

[1]: https://github.com/freedomofpress/dangerzone/pull/622#discussion_r1410986342
---
 dangerzone/conversion/pixels_to_pdf.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py
index 2c3fd6d..f8c670a 100644
--- a/dangerzone/conversion/pixels_to_pdf.py
+++ b/dangerzone/conversion/pixels_to_pdf.py
@@ -58,19 +58,21 @@ class PixelsToPDF(DangerzoneConverter):
                 self.update_progress(
                     f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
                 )
-                ocr_pdf_bytes = pixmap.pdfocr_tobytes(
+                page_pdf_bytes = pixmap.pdfocr_tobytes(
                     compress=True,
                     language=ocr_lang,
                     tessdata=get_tessdata_dir(),
                 )
-                ocr_pdf = fitz.open("pdf", ocr_pdf_bytes)
-                safe_doc.insert_pdf(ocr_pdf)
+                ocr_pdf = fitz.open("pdf", page_pdf_bytes)
             else:  # Don't OCR
                 self.update_progress(
                     f"Converting page {page_num}/{num_pages} from pixels to PDF"
                 )
-                safe_doc.insert_file(pixmap)
+                page_doc = fitz.Document()
+                page_doc.insert_file(pixmap)
+                page_pdf_bytes = page_doc.tobytes(deflate_images=True)
 
+            safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes))
             self.percentage += percentage_per_page
 
         # Next operations apply to the all the pages, so we need to recalculate the