mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-29 18:22:37 +02:00
Move dangerzone-converter code into dangerzone repo
This commit is contained in:
parent
f4ff04ff80
commit
27bdcd408b
8 changed files with 640 additions and 0 deletions
108
dangerzone-converter/.circleci/config.yml
Normal file
108
dangerzone-converter/.circleci/config.yml
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
# CircleCI pipeline for the dangerzone converter container.
#
# The `build` job builds the image and saves it into the workspace as
# image.tar; the `publish-*` jobs load that archive and push it to Docker Hub.
version: 2.1

executors:
  docker-publisher:
    environment:
      IMAGE_NAME: flmcode/dangerzone
    docker:
      - image: circleci/python

jobs:

  build:
    executor: docker-publisher
    steps:
      - checkout
      - setup_remote_docker
      - run:
          name: Build docker image
          command: docker build -t $IMAGE_NAME:latest .
      - run:
          name: Archive docker image
          command: docker save -o image.tar $IMAGE_NAME
      - persist_to_workspace:
          root: .
          paths:
            - ./image.tar

  publish-latest:
    executor: docker-publisher
    steps:
      - attach_workspace:
          at: /tmp/workspace
      - setup_remote_docker
      - run:
          name: Load archived docker image
          command: docker load -i /tmp/workspace/image.tar
      - run:
          name: Publish image to docker Hub
          # Credentials are quoted so that whitespace or glob characters in
          # them are passed through unchanged.
          command: |
            echo "$DOCKERHUB_PASSWORD" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
            docker push $IMAGE_NAME:latest

  publish-tag:
    executor: docker-publisher
    steps:
      - attach_workspace:
          at: /tmp/workspace
      - setup_remote_docker
      - run:
          name: Load archived docker image
          command: docker load -i /tmp/workspace/image.tar
      - run:
          name: Publish image to docker Hub
          # The image tag is the git tag with its leading "v" removed
          # (e.g. v1.2.3 -> 1.2.3); "#v" strips only a leading "v".
          command: |
            echo "$DOCKERHUB_PASSWORD" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
            IMAGE_TAG=${CIRCLE_TAG#v}
            docker tag $IMAGE_NAME:latest $IMAGE_NAME:$IMAGE_TAG
            docker push $IMAGE_NAME:latest
            docker push $IMAGE_NAME:$IMAGE_TAG

workflows:
  version: 2

  # Build and publish :latest on every push to the stable branch.
  build-stable:
    jobs:
      - build:
          filters:
            branches:
              only: stable
      - publish-latest:
          requires:
            - build
          filters:
            branches:
              only: stable

  # Build and publish a versioned image for every tag starting with "v".
  build-tags:
    jobs:
      - build:
          filters:
            tags:
              only: /^v.*/
            branches:
              ignore: /.*/
      - publish-tag:
          requires:
            - build
          filters:
            tags:
              only: /^v.*/
            branches:
              ignore: /.*/

  # Rebuild and republish :latest from stable at 00:00 on the 1st of each month.
  monthly:
    triggers:
      - schedule:
          cron: "0 0 1 * *"
          filters:
            branches:
              only: stable
    jobs:
      - build:
          filters:
            branches:
              only: stable
      - publish-latest:
          requires:
            - build
          filters:
            branches:
              only: stable
|
100
dangerzone-converter/Dockerfile
Normal file
100
dangerzone-converter/Dockerfile
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
FROM alpine:latest

# Install dependencies.
# --no-cache fetches the package index on the fly and does not store it in the
# image layer.
RUN apk --no-cache upgrade && \
    apk --no-cache add \
    ghostscript \
    graphicsmagick \
    libreoffice \
    openjdk8 \
    poppler-utils \
    py3-magic \
    py3-pillow \
    sudo \
    tesseract-ocr \
    tesseract-ocr-data-afr \
    tesseract-ocr-data-ara \
    tesseract-ocr-data-aze \
    tesseract-ocr-data-bel \
    tesseract-ocr-data-ben \
    tesseract-ocr-data-bul \
    tesseract-ocr-data-cat \
    tesseract-ocr-data-ces \
    tesseract-ocr-data-chi_sim \
    tesseract-ocr-data-chi_tra \
    tesseract-ocr-data-chr \
    tesseract-ocr-data-dan \
    tesseract-ocr-data-deu \
    tesseract-ocr-data-ell \
    tesseract-ocr-data-enm \
    tesseract-ocr-data-epo \
    tesseract-ocr-data-equ \
    tesseract-ocr-data-est \
    tesseract-ocr-data-eus \
    tesseract-ocr-data-fin \
    tesseract-ocr-data-fra \
    tesseract-ocr-data-frk \
    tesseract-ocr-data-frm \
    tesseract-ocr-data-glg \
    tesseract-ocr-data-grc \
    tesseract-ocr-data-heb \
    tesseract-ocr-data-hin \
    tesseract-ocr-data-hrv \
    tesseract-ocr-data-hun \
    tesseract-ocr-data-ind \
    tesseract-ocr-data-isl \
    tesseract-ocr-data-ita \
    tesseract-ocr-data-ita_old \
    tesseract-ocr-data-jpn \
    tesseract-ocr-data-kan \
    tesseract-ocr-data-kat \
    tesseract-ocr-data-kor \
    tesseract-ocr-data-lav \
    tesseract-ocr-data-lit \
    tesseract-ocr-data-mal \
    tesseract-ocr-data-mkd \
    tesseract-ocr-data-mlt \
    tesseract-ocr-data-msa \
    tesseract-ocr-data-nld \
    tesseract-ocr-data-nor \
    tesseract-ocr-data-pol \
    tesseract-ocr-data-por \
    tesseract-ocr-data-ron \
    tesseract-ocr-data-rus \
    tesseract-ocr-data-slk \
    tesseract-ocr-data-slv \
    tesseract-ocr-data-spa \
    tesseract-ocr-data-spa_old \
    tesseract-ocr-data-sqi \
    tesseract-ocr-data-srp \
    tesseract-ocr-data-swa \
    tesseract-ocr-data-swe \
    tesseract-ocr-data-tam \
    tesseract-ocr-data-tel \
    tesseract-ocr-data-tgl \
    tesseract-ocr-data-tha \
    tesseract-ocr-data-tur \
    tesseract-ocr-data-ukr \
    tesseract-ocr-data-vie

# Install pdftk: download the pdftk-java jar and create a /usr/local/bin/pdftk
# shell wrapper that runs it with java.
# NOTE(review): the jar is downloaded from a CI job artifact URL and is not
# checked against a known checksum -- consider pinning and verifying a sha256.
RUN \
    wget https://gitlab.com/pdftk-java/pdftk/-/jobs/924565145/artifacts/raw/build/libs/pdftk-all.jar && \
    mv pdftk-all.jar /usr/local/bin && \
    chmod +x /usr/local/bin/pdftk-all.jar && \
    echo '#!/bin/sh' > /usr/local/bin/pdftk && \
    echo '/usr/bin/java -jar "/usr/local/bin/pdftk-all.jar" "$@"' >> /usr/local/bin/pdftk && \
    chmod +x /usr/local/bin/pdftk

COPY scripts/* /usr/local/bin/

# Add the unprivileged user
RUN adduser -h /home/user -s /bin/sh -D user

# /tmp/input_file is where the first script expects the input file to be, and
# /tmp is where it will write the pixel files
#
# /dangerzone is where the second script expects files to be put by the first one
#
# /safezone is where the wrapper eventually moves the sanitized files.
VOLUME /dangerzone /tmp/input_file /safezone
|
21
dangerzone-converter/LICENSE
Normal file
21
dangerzone-converter/LICENSE
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2020 First Look Media
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
9
dangerzone-converter/README.md
Normal file
9
dangerzone-converter/README.md
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
# dangerzone-converter
|
||||||
|
|
||||||
|
This is the container for [dangerzone](https://github.com/firstlookmedia/dangerzone), which converts a potentially dangerous PDF, office document, or image into a safe PDF.
|
||||||
|
|
||||||
|
## Development notes
|
||||||
|
|
||||||
|
Issues are tracked [here](https://github.com/firstlookmedia/dangerzone/issues?q=is%3Aissue+is%3Aopen+label%3Acontainer), in the dangerzone repository using the `container` label.
|
||||||
|
|
||||||
|
Containers are built in continuous integration when commits are pushed to the `stable` branch.
|
27
dangerzone-converter/scripts/document-to-pixels
Executable file
27
dangerzone-converter/scripts/document-to-pixels
Executable file
|
@ -0,0 +1,27 @@
|
||||||
|
#!/bin/sh

# Wrapper around document-to-pixels-unpriv: makes /tmp/input_file readable,
# runs the conversion as the unprivileged "user" account via sudo, restores the
# file's original mode, and moves the resulting page files into /dangerzone.
# Exits with the converter's exit code if the conversion fails.

# Remove this warning by setting the host in /etc/hosts:
# sudo: unable to resolve host 8160b021d811: Temporary failure in name resolution
echo "127.0.0.1 $(hostname)" >> /etc/hosts

# Record original permissions (octal mode), and make document readable
START_PERMISSIONS=$(stat -c '%a' /tmp/input_file)
/bin/chmod 0644 /tmp/input_file

# Do the conversion without root
/usr/bin/sudo -u user /usr/local/bin/document-to-pixels-unpriv
RETURN_CODE=$?

# Restore original permissions
/bin/chmod "$START_PERMISSIONS" /tmp/input_file

# Check for failure
if [ "$RETURN_CODE" -ne 0 ]; then
    echo ""
    exit "$RETURN_CODE"
fi

# Move converted files into /dangerzone
/bin/mv /tmp/page-*.rgb /dangerzone
/bin/mv /tmp/page-*.width /dangerzone
/bin/mv /tmp/page-*.height /dangerzone
|
219
dangerzone-converter/scripts/document-to-pixels-unpriv
Executable file
219
dangerzone-converter/scripts/document-to-pixels-unpriv
Executable file
|
@ -0,0 +1,219 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys
|
||||||
|
import subprocess
|
||||||
|
import glob
|
||||||
|
import shutil
|
||||||
|
import os
|
||||||
|
|
||||||
|
import magic
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
def print_flush(s=""):
    """Print *s* followed by a newline and flush stdout immediately.

    Flushing after every message means a process reading this script's
    stdout receives each message as soon as it is printed.

    Args:
        s: The text to print. Defaults to an empty string, which prints a
            blank line.
    """
    print(s, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_step(args, error_prefix, failure_prefix, timeout=60):
    """Run one external conversion command, exiting the script on failure.

    The command's stdout/stderr are not captured, so whatever the tool prints
    goes straight to this process's own stdout/stderr.

    Args:
        args: Command and arguments, passed to ``subprocess.run`` as a list.
        error_prefix: Start of the message printed when the command times out;
            " timed out after N seconds" is appended to it.
        failure_prefix: Start of the message printed when the command returns
            a non-zero exit code; the exit code is appended to it.
        timeout: Maximum number of seconds the command may run.

    Raises:
        SystemExit: With status 1 if the command times out or fails.
    """
    try:
        p = subprocess.run(args, timeout=timeout)
    except subprocess.TimeoutExpired:
        print_flush(f"{error_prefix} timed out after {timeout} seconds")
        sys.exit(1)

    if p.returncode != 0:
        # Output is not captured (p.stdout is None), so report the exit code.
        print_flush(f"{failure_prefix}: exit code {p.returncode}")
        sys.exit(1)


def main():
    """Convert /tmp/input_file into per-page raw RGB pixel files in /tmp.

    Steps:
      1. Detect the MIME type of /tmp/input_file and reject unsupported types.
      2. Convert the input to PDF (LibreOffice for office documents,
         GraphicsMagick for images; PDFs are used as-is).
      3. Split the PDF into one PDF per page with pdftk.
      4. For each page, render a PNG with pdftocairo, record its width and
         height in /tmp/page-N.width and /tmp/page-N.height, convert it to
         8-bit RGB data in /tmp/page-N.rgb, and delete the PNG.

    Raises:
        SystemExit: With status 1 if the format is unsupported or any external
            command fails or times out.
    """
    conversions = {
        # .pdf
        "application/pdf": {"type": None},
        # .docx
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .doc
        "application/msword": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .docm
        "application/vnd.ms-word.document.macroEnabled.12": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .xlsx
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .xls
        "application/vnd.ms-excel": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .pptx
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .ppt
        "application/vnd.ms-powerpoint": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .odt
        "application/vnd.oasis.opendocument.text": {
            "type": "libreoffice",
            "libreoffice_output_filter": "writer_pdf_Export",
        },
        # .odg
        "application/vnd.oasis.opendocument.graphics": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .odp
        "application/vnd.oasis.opendocument.presentation": {
            "type": "libreoffice",
            "libreoffice_output_filter": "impress_pdf_Export",
        },
        # .ods
        "application/vnd.oasis.opendocument.spreadsheet": {
            "type": "libreoffice",
            "libreoffice_output_filter": "calc_pdf_Export",
        },
        # .jpg
        "image/jpeg": {"type": "convert"},
        # .gif
        "image/gif": {"type": "convert"},
        # .png
        "image/png": {"type": "convert"},
        # .tif
        "image/tiff": {"type": "convert"},
        "image/x-tiff": {"type": "convert"},
    }

    # Detect MIME type
    mime = magic.Magic(mime=True)
    mime_type = mime.from_file("/tmp/input_file")

    # Validate MIME type
    if mime_type not in conversions:
        print_flush("The document format is not supported")
        sys.exit(1)

    # Convert input document to PDF
    conversion = conversions[mime_type]
    if conversion["type"] is None:
        # Already a PDF: use the input file directly
        pdf_filename = "/tmp/input_file"
    elif conversion["type"] == "libreoffice":
        print_flush("Converting to PDF using LibreOffice")
        _run_step(
            [
                "libreoffice",
                "--headless",
                "--convert-to",
                f"pdf:{conversion['libreoffice_output_filter']}",
                "--outdir",
                "/tmp",
                "/tmp/input_file",
            ],
            "Error converting document to PDF, LibreOffice",
            "Conversion to PDF failed",
        )
        pdf_filename = "/tmp/input_file.pdf"
    elif conversion["type"] == "convert":
        print_flush("Converting to PDF using GraphicsMagick")
        _run_step(
            ["gm", "convert", "/tmp/input_file", "/tmp/input_file.pdf"],
            "Error converting document to PDF, GraphicsMagick",
            "Conversion to PDF failed",
        )
        pdf_filename = "/tmp/input_file.pdf"
    else:
        print_flush("Invalid conversion type")
        sys.exit(1)

    # Separate PDF into pages
    print_flush("")
    print_flush("Separating document into pages")
    _run_step(
        ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"],
        "Error separating document into pages, pdftk",
        "Separating document into pages failed",
    )

    page_filenames = glob.glob("/tmp/page-*.pdf")
    print_flush(f"Document has {len(page_filenames)} pages")
    print_flush("")

    # Convert to RGB pixel data
    for page in range(1, len(page_filenames) + 1):
        filename_base = f"/tmp/page-{page}"
        pdf_filename = f"{filename_base}.pdf"
        png_filename = f"{filename_base}.png"
        rgb_filename = f"{filename_base}.rgb"
        width_filename = f"{filename_base}.width"
        height_filename = f"{filename_base}.height"

        print_flush(f"Converting page {page} to pixels")

        # Convert to png
        _run_step(
            ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
            "Error converting from PDF to PNG, pdftocairo",
            "Conversion from PDF to PNG failed",
        )

        # Save the width and height; the context manager closes the image file
        with Image.open(png_filename) as im:
            width, height = im.size
        with open(width_filename, "w") as f:
            f.write(str(width))
        with open(height_filename, "w") as f:
            f.write(str(height))

        # Convert to RGB pixels
        _run_step(
            ["gm", "convert", png_filename, "-depth", "8", f"rgb:{rgb_filename}"],
            "Error converting from PNG to pixels, convert",
            "Conversion from PNG to RGB failed",
        )

        # Delete the png
        os.remove(png_filename)


if __name__ == "__main__":
    main()
|
17
dangerzone-converter/scripts/pixels-to-pdf
Executable file
17
dangerzone-converter/scripts/pixels-to-pdf
Executable file
|
@ -0,0 +1,17 @@
|
||||||
|
#!/bin/sh

# Wrapper around pixels-to-pdf-unpriv: runs the conversion as the unprivileged
# "user" account via sudo, passing OCR and OCR_LANGUAGE through, and then moves
# the resulting PDFs into /safezone. Exits with the converter's exit code if
# the conversion fails.

# Remove this warning by setting the host in /etc/hosts:
# sudo: unable to resolve host 8160b021d811: Temporary failure in name resolution
echo "127.0.0.1 $(hostname)" >> /etc/hosts

# Do the conversion without root
# (values are quoted so they reach sudo as single words)
/usr/bin/sudo OCR="$OCR" OCR_LANGUAGE="$OCR_LANGUAGE" -u user /usr/local/bin/pixels-to-pdf-unpriv
RETURN_CODE=$?
if [ "$RETURN_CODE" -ne 0 ]; then
    echo ""
    exit "$RETURN_CODE"
fi

# Move converted files into /safezone
/bin/mv /tmp/safe-output.pdf /safezone
/bin/mv /tmp/safe-output-compressed.pdf /safezone
|
139
dangerzone-converter/scripts/pixels-to-pdf-unpriv
Executable file
139
dangerzone-converter/scripts/pixels-to-pdf-unpriv
Executable file
|
@ -0,0 +1,139 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def print_flush(s=""):
    """Write *s* and a trailing newline to stdout, then flush stdout.

    Args:
        s: Text to print; the default empty string prints a blank line.
    """
    print(s, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_step(args, error_prefix, failure_prefix, timeout=60):
    """Run one external command, exiting the script on failure.

    The command's stdout/stderr are not captured, so whatever the tool prints
    goes straight to this process's own stdout/stderr.

    Args:
        args: Command and arguments, passed to ``subprocess.run`` as a list.
        error_prefix: Start of the message printed when the command times out;
            " timed out after N seconds" is appended to it.
        failure_prefix: Start of the message printed when the command returns
            a non-zero exit code; the exit code is appended to it.
        timeout: Maximum number of seconds the command may run.

    Raises:
        SystemExit: With status 1 if the command times out or fails.
    """
    try:
        p = subprocess.run(args, timeout=timeout)
    except subprocess.TimeoutExpired:
        print_flush(f"{error_prefix} timed out after {timeout} seconds")
        sys.exit(1)

    if p.returncode != 0:
        # Output is not captured (p.stdout is None), so report the exit code.
        print_flush(f"{failure_prefix}: exit code {p.returncode}")
        sys.exit(1)


def _read_dimension(path):
    """Read a positive integer page dimension from *path*.

    The value ends up in a GraphicsMagick ``-size`` argument, so anything that
    is not a positive integer is rejected instead of being passed through.

    Raises:
        SystemExit: With status 1 if the file does not hold a positive integer.
    """
    with open(path) as f:
        raw = f.read().strip()
    try:
        value = int(raw)
    except ValueError:
        value = 0
    if value <= 0:
        print_flush(f"Invalid page dimension in {path}")
        sys.exit(1)
    return value


def main():
    """Rebuild a PDF from the per-page RGB pixel files in /dangerzone.

    For every /dangerzone/page-N.rgb (with matching .width and .height files)
    a one-page PDF is written to /tmp/page-N.pdf. When the OCR environment
    variable is "1" the page goes through PNG and tesseract to produce a
    searchable PDF; otherwise GraphicsMagick converts the pixels directly.
    The pages are then merged into /tmp/safe-output.pdf with pdfunite and
    compressed into /tmp/safe-output-compressed.pdf with ps2pdf.

    Raises:
        SystemExit: With status 1 if there are no pages, a dimension file is
            invalid, or any external command fails or times out.
    """
    num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
    print_flush(f"Document has {num_pages} pages")

    if num_pages == 0:
        # Nothing to merge, and the compression timeout below would be 0.
        print_flush("No pages found to convert")
        sys.exit(1)

    ocr_enabled = os.environ.get("OCR") == "1"
    ocr_language = os.environ.get("OCR_LANGUAGE")

    # Convert RGB files to PDF files
    for page in range(1, num_pages + 1):
        filename_base = f"/dangerzone/page-{page}"
        rgb_filename = f"{filename_base}.rgb"
        width_filename = f"{filename_base}.width"
        height_filename = f"{filename_base}.height"
        png_filename = f"/tmp/page-{page}.png"
        ocr_filename = f"/tmp/page-{page}"
        pdf_filename = f"/tmp/page-{page}.pdf"

        width = _read_dimension(width_filename)
        height = _read_dimension(height_filename)

        if ocr_enabled:
            # OCR the document
            print_flush(f"Converting page {page} from pixels to searchable PDF")

            _run_step(
                [
                    "gm",
                    "convert",
                    "-size",
                    f"{width}x{height}",
                    "-depth",
                    "8",
                    f"rgb:{rgb_filename}",
                    f"png:{png_filename}",
                ],
                "Error converting pixels to PNG, convert",
                f"Page {page} conversion failed",
            )

            tesseract_args = ["tesseract", png_filename, ocr_filename]
            if ocr_language:
                # Only pass -l when a language is set: a None entry in the
                # argument list would make subprocess.run raise TypeError.
                tesseract_args += ["-l", ocr_language]
            tesseract_args += ["--dpi", "70", "pdf"]
            _run_step(
                tesseract_args,
                "Error converting PNG to searchable PDF, tesseract",
                f"Page {page} conversion failed",
            )

        else:
            # Don't OCR
            print_flush(f"Converting page {page} from pixels to PDF")

            _run_step(
                [
                    "gm",
                    "convert",
                    "-size",
                    f"{width}x{height}",
                    "-depth",
                    "8",
                    f"rgb:{rgb_filename}",
                    f"pdf:{pdf_filename}",
                ],
                "Error converting RGB to PDF, convert",
                f"Page {page} conversion failed",
            )

    print_flush("")

    # Merge pages into a single PDF
    print_flush(f"Merging {num_pages} pages into a single PDF")
    merge_args = ["pdfunite"]
    merge_args.extend(f"/tmp/page-{page}.pdf" for page in range(1, num_pages + 1))
    merge_args.append("/tmp/safe-output.pdf")
    _run_step(
        merge_args,
        "Error merging pages into a single PDF, pdfunite",
        "Merge failed",
    )

    # Compress; the timeout is 3 seconds per page
    print_flush("Compressing PDF")
    compress_timeout = num_pages * 3
    _run_step(
        ["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
        "Error compressing PDF, ps2pdf",
        "Compression failed",
        timeout=compress_timeout,
    )


if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue