mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
94 lines
3.3 KiB
Python
94 lines
3.3 KiB
Python
import hashlib
|
|
import io
|
|
import json
|
|
import logging
|
|
import pathlib
|
|
import subprocess
|
|
import sys
|
|
import tarfile
|
|
import urllib.request
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
TESSDATA_RELEASES_URL = (
|
|
"https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest"
|
|
)
|
|
TESSDATA_ARCHIVE_URL = "https://github.com/tesseract-ocr/tessdata_fast/archive/{tessdata_version}/tessdata_fast-{tessdata_version}.tar.gz"
|
|
TESSDATA_CHECKSUM = "d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9"
|
|
|
|
|
|
def git_root():
|
|
"""Get the root directory of the Git repo."""
|
|
# FIXME: Use a Git Python binding for this.
|
|
# FIXME: Make this work if called outside the repo.
|
|
cmd = ["git", "rev-parse", "--show-toplevel"]
|
|
path = (
|
|
subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
|
|
.stdout.decode()
|
|
.strip("\n")
|
|
)
|
|
return pathlib.Path(path)
|
|
|
|
|
|
def main():
|
|
logging.basicConfig(
|
|
level=logging.DEBUG,
|
|
format="%(asctime)s - %(levelname)s - %(message)s",
|
|
datefmt="%Y-%m-%d %H:%M:%S",
|
|
)
|
|
|
|
share_dir = git_root() / "share"
|
|
tessdata_dir = share_dir / "tessdata"
|
|
|
|
# Get the list of OCR languages that Dangerzone supports.
|
|
with open(share_dir / "ocr-languages.json") as f:
|
|
langs_short = sorted(json.loads(f.read()).values())
|
|
|
|
# Check if these languages have already been downloaded.
|
|
if tessdata_dir.exists():
|
|
expected_files = {f"{lang}.traineddata" for lang in langs_short}
|
|
files = {f.name for f in tessdata_dir.iterdir()}
|
|
if files == expected_files:
|
|
logger.info("Skipping tessdata download, language data already exists")
|
|
return
|
|
elif not files:
|
|
logger.info("Tesseract dir is empty, proceeding to download language data")
|
|
else:
|
|
logger.info(f"Found {tessdata_dir} but contents do not match")
|
|
return 1
|
|
|
|
# Get latest release of Tesseract data.
|
|
logger.info("Getting latest tessdata release")
|
|
with urllib.request.urlopen(TESSDATA_RELEASES_URL) as f:
|
|
resp = f.read()
|
|
releases = json.loads(resp)
|
|
tag = releases["tag_name"]
|
|
|
|
# Get latest release of Tesseract data.
|
|
logger.info(f"Downloading tessdata release {tag}")
|
|
archive_url = TESSDATA_ARCHIVE_URL.format(tessdata_version=tag)
|
|
with urllib.request.urlopen(archive_url) as f:
|
|
archive = f.read()
|
|
digest = hashlib.sha256(archive).hexdigest()
|
|
if digest != TESSDATA_CHECKSUM:
|
|
raise RuntimeError(f"Checksum mismatch {digest} != {TESSDATA_CHECKSUM}")
|
|
|
|
# Extract the languages models from the tessdata archive.
|
|
logger.info(f"Extracting tessdata archive into {tessdata_dir}")
|
|
with tarfile.open(fileobj=io.BytesIO(archive)) as t:
|
|
for lang in langs_short:
|
|
member = f"tessdata_fast-{tag}/{lang}.traineddata"
|
|
logger.info(f"Extracting {member}")
|
|
# NOTE: We want `filter="data"` because it ignores ownership info, as
|
|
# recorded in the tarfile. This filter will become the default in Python
|
|
# 3.14. See:
|
|
#
|
|
# https://docs.python.org/3/library/tarfile.html#tarfile-extraction-filter
|
|
t.extract(member=member, path=share_dir, filter="data")
|
|
|
|
tessdata_dl_dir = share_dir / f"tessdata_fast-{tag}"
|
|
tessdata_dl_dir.rename(tessdata_dir)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|