Add script for downloading Tesseract data

Add a Python script that can run on all supported platforms and can
download and extract the Tesseract language data from GitHub, while
also:

1. Checking that the downloaded archive matches the expected hash (see the note after this list).
2. Informing the user if the language data have already been downloaded.
3. Extracting only the subset of language data that Dangerzone needs.
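
Note: the archive checksum is pinned in the script (TESSDATA_CHECKSUM below), while the release tag is resolved at runtime. Below is a minimal sketch, not part of the script itself, of how the pin could be recomputed when upstream publishes a new tessdata_fast release; the tag value is only an example, and the URL follows the same template the script uses:

import hashlib
import urllib.request

tag = "4.1.0"  # example tag; substitute the actual latest tessdata_fast release
url = (
    "https://github.com/tesseract-ocr/tessdata_fast/archive/"
    f"{tag}/tessdata_fast-{tag}.tar.gz"
)
with urllib.request.urlopen(url) as f:
    print(hashlib.sha256(f.read()).hexdigest())
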
Alex Pyrgiotis 2024-03-14 00:24:49 +02:00
parent 6547998633
commit 0921cc23e7
2 changed files with 81 additions and 0 deletions

.gitignore

@@ -22,6 +22,7 @@ var/
 wheels/
 pip-wheel-metadata/
 share/python-wheels/
+share/tessdata/
 *.egg-info/
 .installed.cfg
 *.egg

@@ -0,0 +1,80 @@
import hashlib
import io
import json
import pathlib
import re
import subprocess
import sys
import tarfile
import urllib.request

TESSDATA_RELEASES_URL = (
"https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest"
)
TESSDATA_ARCHIVE_URL = "https://github.com/tesseract-ocr/tessdata_fast/archive/{tessdata_version}/tessdata_fast-{tessdata_version}.tar.gz"
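# SHA-256 of the pinned tessdata_fast release archive. The release tag is
# resolved at runtime, so this checksum must be updated whenever upstream
# publishes a new release.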
TESSDATA_CHECKSUM = "d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9"


def git_root():
    """Get the root directory of the Git repo."""
    # FIXME: Use a Git Python binding for this.
    # FIXME: Make this work if called outside the repo.
    cmd = ["git", "rev-parse", "--show-toplevel"]
    path = (
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
        .stdout.decode()
        .strip("\n")
    )
    return pathlib.Path(path)

def main():
    share_dir = git_root() / "share"
    tessdata_dir = share_dir / "tessdata"

    # Get the list of OCR languages that Dangerzone supports.
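    # The values in ocr-languages.json are short Tesseract language codes (e.g. "eng").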
    with open(share_dir / "ocr-languages.json") as f:
        langs_short = sorted(json.loads(f.read()).values())

    # Check if these languages have already been downloaded.
    if tessdata_dir.exists():
        expected_files = {f"{lang}.traineddata" for lang in langs_short}
        files = {f.name for f in tessdata_dir.iterdir()}
        if files == expected_files:
            msg = "> Skipping tessdata download, language data already exists"
            print(msg, file=sys.stderr)
            return
        else:
            print(f"Found {tessdata_dir} but contents do not match", file=sys.stderr)
            return 1

    # Get latest release of Tesseract data.
    print("> Getting latest tessdata release", file=sys.stderr)
    with urllib.request.urlopen(TESSDATA_RELEASES_URL) as f:
        resp = f.read()
        releases = json.loads(resp)
        tag = releases["tag_name"]

    # Download the Tesseract data for that release and verify its checksum.
    print(f"> Downloading tessdata release {tag}", file=sys.stderr)
    archive_url = TESSDATA_ARCHIVE_URL.format(tessdata_version=tag)
    with urllib.request.urlopen(archive_url) as f:
        archive = f.read()
    digest = hashlib.sha256(archive).hexdigest()
    if digest != TESSDATA_CHECKSUM:
        raise RuntimeError(f"Checksum mismatch {digest} != {TESSDATA_CHECKSUM}")

    # Extract the language models from the tessdata archive.
    print(f"> Extracting tessdata archive into {tessdata_dir}", file=sys.stderr)
    with tarfile.open(fileobj=io.BytesIO(archive)) as t:
        for lang in langs_short:
            member = f"tessdata_fast-{tag}/{lang}.traineddata"
            print(f">> Extracting {member}")
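            # set_attrs=False: don't restore the archive's owner, mode, or mtime.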
            t.extract(member=member, path=share_dir, set_attrs=False)

    tessdata_dl_dir = share_dir / f"tessdata_fast-{tag}"
    tessdata_dl_dir.rename(tessdata_dir)

if __name__ == "__main__":
    sys.exit(main())
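
Once the script succeeds, share/tessdata/ contains one <lang>.traineddata file
per supported language. As an optional sanity check (not part of this commit,
and assuming a local tesseract CLI is available), running
"tesseract --list-langs --tessdata-dir share/tessdata" from the repository root
should list exactly those languages.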