Fix DPI mismatch between doc2pixels and pixels2pdf

The converted document was larger in dimensions than the original one due to a mismatch in DPI settings. When converting documents to pixels, we were setting the DPI to 150 pixels per inch. Then, when converting back into a PDF, we were using 70 DPI. This difference would result in an overall larger document in dimensions (though not necessarily in file size). Fixes #626
2025-04-28 18:02:38 +02:00 · 2023-12-04 16:38:53 +00:00 · 2023-12-04 16:38:53 +00:00 · 576cbd3382
commit 576cbd3382
parent e5dbe25abb
4 changed files with 8 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,9 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or

 ## Unreleased

+### Fixed
+- Fix mismatch in dimensions between the original document and the converted one ([issue #626](https://github.com/freedomofpress/dangerzone/issues/626)). This does not affect the quality of the final document.
+
 ### Changed

 - Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET))
--- a/dangerzone/conversion/common.py
+++ b/dangerzone/conversion/common.py
@ -15,6 +15,7 @@ from typing import Callable, Dict, List, Optional, Tuple, Union
 TIMEOUT_PER_PAGE: float = 30  # (seconds)
 TIMEOUT_PER_MB: float = 30  # (seconds)
 TIMEOUT_MIN: float = 60  # (seconds)
+DEFAULT_DPI = 150  # Pixels per inch


 def running_on_qubes() -> bool:
--- a/dangerzone/conversion/doc_to_pixels.py
+++ b/dangerzone/conversion/doc_to_pixels.py
@ -19,7 +19,7 @@ import fitz
 import magic

 from . import errors
-from .common import DangerzoneConverter, running_on_qubes
+from .common import DEFAULT_DPI, DangerzoneConverter, running_on_qubes


 class DocumentToPixels(DangerzoneConverter):
@ -245,7 +245,7 @@ class DocumentToPixels(DangerzoneConverter):
            self.update_progress(
                f"Converting page {page_num}/{doc.page_count} to pixels"
            )
-            pix = page.get_pixmap(dpi=150)
+            pix = page.get_pixmap(dpi=DEFAULT_DPI)
            rgb_buf = pix.samples_mv
            await self.write_page_width(pix.width, width_filename)
            await self.write_page_height(pix.height, height_filename)
--- a/dangerzone/conversion/pixels_to_pdf.py
+++ b/dangerzone/conversion/pixels_to_pdf.py
@ -15,7 +15,7 @@ from typing import Optional

 import fitz

-from .common import DangerzoneConverter, get_tessdata_dir, running_on_qubes
+from .common import DEFAULT_DPI, DangerzoneConverter, get_tessdata_dir, running_on_qubes


 class PixelsToPDF(DangerzoneConverter):
@ -52,6 +52,7 @@ class PixelsToPDF(DangerzoneConverter):
            pixmap = fitz.Pixmap(
                fitz.Colorspace(fitz.CS_RGB), width, height, untrusted_rgb_data, False
            )
+            pixmap.set_dpi(DEFAULT_DPI, DEFAULT_DPI)
            if ocr_lang:  # OCR the document
                self.update_progress(
                    f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"