dangerzone/install/common/download-tessdata.py
Alex Pyrgiotis 0921cc23e7
Add script for downloading Tesseract data
Add a Python script that can run in all supported platforms, and can
download and extract the Tesseract language data from GitHub, while
also:

1. Checking that the expected hash matches.
2. Informing the user if the language data have already been downloaded.
3. Extracting only the subset of language data that Dangerzone needs
2024-10-08 19:10:02 +03:00

80 lines
2.8 KiB
Python

import hashlib
import io
import json
import pathlib
import re
import subprocess
import sys
import tarfile
import urllib.request
# GitHub API endpoint describing the latest tessdata_fast release. main() reads
# the "tag_name" field of the JSON response to learn which version to download.
TESSDATA_RELEASES_URL = (
"https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest"
)
# URL template for the release source tarball. main() fills the
# `{tessdata_version}` placeholder with the release tag obtained above.
TESSDATA_ARCHIVE_URL = "https://github.com/tesseract-ocr/tessdata_fast/archive/{tessdata_version}/tessdata_fast-{tessdata_version}.tar.gz"
# Expected SHA-256 hex digest of the downloaded archive; main() aborts on a
# mismatch.
# NOTE(review): the digest is pinned while the tag is resolved dynamically, so
# this value presumably has to be updated whenever upstream publishes a new
# release -- confirm.
TESSDATA_CHECKSUM = "d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9"
def git_root():
    """Return the top-level directory of the current Git repository.

    Runs ``git rev-parse --show-toplevel`` in the current working directory
    and wraps its output in a ``pathlib.Path``.

    Raises:
        subprocess.CalledProcessError: If ``git`` exits with a non-zero
            status (``check=True``).
    """
    # FIXME: Use a Git Python binding for this.
    # FIXME: Make this work if called outside the repo.
    result = subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        check=True,
        stdout=subprocess.PIPE,
    )
    # Drop only the trailing newline(s) that Git appends to the path.
    toplevel = result.stdout.decode().strip("\n")
    return pathlib.Path(toplevel)
def main():
    """Download the Tesseract language data that Dangerzone supports.

    Reads the supported OCR languages from ``share/ocr-languages.json`` under
    the Git root, then downloads the latest ``tessdata_fast`` release archive,
    verifies its SHA-256 digest against ``TESSDATA_CHECKSUM``, and extracts
    only the ``<lang>.traineddata`` files for those languages into
    ``share/tessdata``. All progress messages are written to stderr.

    Returns:
        ``None`` when the data were downloaded or were already present, or
        ``1`` when ``share/tessdata`` exists but its contents do not match the
        expected set of files.

    Raises:
        RuntimeError: If the digest of the downloaded archive does not match
            ``TESSDATA_CHECKSUM``.
    """
    share_dir = git_root() / "share"
    tessdata_dir = share_dir / "tessdata"

    # Get the list of OCR languages that Dangerzone supports. JSON is UTF-8 by
    # specification, so do not depend on the platform's locale encoding.
    with open(share_dir / "ocr-languages.json", encoding="utf-8") as f:
        langs_short = sorted(json.load(f).values())

    # Check if these languages have already been downloaded.
    if tessdata_dir.exists():
        expected_files = {f"{lang}.traineddata" for lang in langs_short}
        files = {f.name for f in tessdata_dir.iterdir()}
        if files == expected_files:
            msg = "> Skipping tessdata download, language data already exists"
            print(msg, file=sys.stderr)
            return
        else:
            print(f"Found {tessdata_dir} but contents do not match", file=sys.stderr)
            return 1

    # Get latest release of Tesseract data.
    print("> Getting latest tessdata release", file=sys.stderr)
    with urllib.request.urlopen(TESSDATA_RELEASES_URL) as f:
        resp = f.read()
    release = json.loads(resp)
    tag = release["tag_name"]

    # Download the source archive for that release and verify its integrity
    # before touching the filesystem.
    print(f"> Downloading tessdata release {tag}", file=sys.stderr)
    archive_url = TESSDATA_ARCHIVE_URL.format(tessdata_version=tag)
    with urllib.request.urlopen(archive_url) as f:
        archive = f.read()
    digest = hashlib.sha256(archive).hexdigest()
    if digest != TESSDATA_CHECKSUM:
        raise RuntimeError(f"Checksum mismatch {digest} != {TESSDATA_CHECKSUM}")

    # Extract the languages models from the tessdata archive.
    print(f"> Extracting tessdata archive into {tessdata_dir}", file=sys.stderr)
    # Extraction filters only exist on Python versions that ship
    # `tarfile.data_filter`; requesting one explicitly there avoids the
    # DeprecationWarning emitted by Python 3.12/3.13 when no filter is given.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    with tarfile.open(fileobj=io.BytesIO(archive)) as t:
        for lang in langs_short:
            member = f"tessdata_fast-{tag}/{lang}.traineddata"
            # Progress output goes to stderr, like every other message here.
            print(f">> Extracting {member}", file=sys.stderr)
            t.extract(member=member, path=share_dir, set_attrs=False, **extract_kwargs)

    # The archive nests its files under `tessdata_fast-<tag>/`; move that
    # directory to its final location once every language has been extracted.
    tessdata_dl_dir = share_dir / f"tessdata_fast-{tag}"
    tessdata_dl_dir.rename(tessdata_dir)
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)