From 576cbd33820f45df3909d6ad1119502e370598ea Mon Sep 17 00:00:00 2001 From: deeplow Date: Mon, 4 Dec 2023 16:38:53 +0000 Subject: [PATCH] Fix DPI mismatch between doc2pixels and pixels2pdf The converted document was larger in dimensions than the original one due to a mismatch in DPI settings. When converting documents to pixels we were setting the DPI to 150 pixels per inch. Then when converting back into a PDF we were using 70 DPI. This difference would result in an overall larger document in dimensions (though not necessarily in file size). Fixes #626 --- CHANGELOG.md | 3 +++ dangerzone/conversion/common.py | 1 + dangerzone/conversion/doc_to_pixels.py | 4 ++-- dangerzone/conversion/pixels_to_pdf.py | 3 ++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d9ca256..f808968 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or ## Unreleased +### Fixed +- Fix mismatch in dimensions between original document and converted one ([issue #626](https://github.com/freedomofpress/dangerzone/issues/626)). This does not affect the quality of the final document. 
+ ### Changed - Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET)) diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index 94f9580..5b75186 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -15,6 +15,7 @@ from typing import Callable, Dict, List, Optional, Tuple, Union TIMEOUT_PER_PAGE: float = 30 # (seconds) TIMEOUT_PER_MB: float = 30 # (seconds) TIMEOUT_MIN: float = 60 # (seconds) +DEFAULT_DPI = 150 # Pixels per inch def running_on_qubes() -> bool: diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py index d3d351a..60157d7 100644 --- a/dangerzone/conversion/doc_to_pixels.py +++ b/dangerzone/conversion/doc_to_pixels.py @@ -19,7 +19,7 @@ import fitz import magic from . import errors -from .common import DangerzoneConverter, running_on_qubes +from .common import DEFAULT_DPI, DangerzoneConverter, running_on_qubes class DocumentToPixels(DangerzoneConverter): @@ -245,7 +245,7 @@ class DocumentToPixels(DangerzoneConverter): self.update_progress( f"Converting page {page_num}/{doc.page_count} to pixels" ) - pix = page.get_pixmap(dpi=150) + pix = page.get_pixmap(dpi=DEFAULT_DPI) rgb_buf = pix.samples_mv await self.write_page_width(pix.width, width_filename) await self.write_page_height(pix.height, height_filename) diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 13516e0..d5732dc 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -15,7 +15,7 @@ from typing import Optional import fitz -from .common import DangerzoneConverter, get_tessdata_dir, running_on_qubes +from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes class PixelsToPDF(DangerzoneConverter): @@ -52,6 +52,7 @@ class 
PixelsToPDF(DangerzoneConverter): pixmap = fitz.Pixmap( fitz.Colorspace(fitz.CS_RGB), width, height, untrusted_rgb_data, False ) + pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI) if ocr_lang: # OCR the document self.update_progress( f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"