FIXUP: Replace print statements with logging

This commit is contained in:
Alex Pyrgiotis 2024-10-09 21:54:42 +03:00
parent 8db9261ccf
commit c37ff7322d
No known key found for this signature in database
GPG key ID: B6C15EBA0357C9AA

View file

@@ -1,13 +1,15 @@
import hashlib import hashlib
import io import io
import json import json
import logging
import pathlib import pathlib
import re
import subprocess import subprocess
import sys import sys
import tarfile import tarfile
import urllib.request import urllib.request
logger = logging.getLogger(__name__)
TESSDATA_RELEASES_URL = ( TESSDATA_RELEASES_URL = (
"https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest" "https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest"
) )
@@ -29,6 +31,12 @@ def git_root():
def main(): def main():
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
share_dir = git_root() / "share" share_dir = git_root() / "share"
tessdata_dir = share_dir / "tessdata" tessdata_dir = share_dir / "tessdata"
@@ -41,22 +49,21 @@ def main():
expected_files = {f"{lang}.traineddata" for lang in langs_short} expected_files = {f"{lang}.traineddata" for lang in langs_short}
files = {f.name for f in tessdata_dir.iterdir()} files = {f.name for f in tessdata_dir.iterdir()}
if files == expected_files: if files == expected_files:
msg = "> Skipping tessdata download, language data already exists" logger.info("Skipping tessdata download, language data already exists")
print(msg, file=sys.stderr)
return return
else: else:
print(f"Found {tessdata_dir} but contents do not match", file=sys.stderr) logger.info(f"Found {tessdata_dir} but contents do not match")
return 1 return 1
# Get latest release of Tesseract data. # Get latest release of Tesseract data.
print(f"> Getting latest tessdata release", file=sys.stderr) logger.info("Getting latest tessdata release")
with urllib.request.urlopen(TESSDATA_RELEASES_URL) as f: with urllib.request.urlopen(TESSDATA_RELEASES_URL) as f:
resp = f.read() resp = f.read()
releases = json.loads(resp) releases = json.loads(resp)
tag = releases["tag_name"] tag = releases["tag_name"]
# Download the archive of the latest Tesseract data release. # Download the archive of the latest Tesseract data release.
print(f"> Downloading tessdata release {tag}", file=sys.stderr) logger.info(f"Downloading tessdata release {tag}")
archive_url = TESSDATA_ARCHIVE_URL.format(tessdata_version=tag) archive_url = TESSDATA_ARCHIVE_URL.format(tessdata_version=tag)
with urllib.request.urlopen(archive_url) as f: with urllib.request.urlopen(archive_url) as f:
archive = f.read() archive = f.read()
@@ -65,12 +72,11 @@ def main():
raise RuntimeError(f"Checksum mismatch {digest} != {TESSDATA_CHECKSUM}") raise RuntimeError(f"Checksum mismatch {digest} != {TESSDATA_CHECKSUM}")
# Extract the language models from the tessdata archive. # Extract the language models from the tessdata archive.
print(f"> Extracting tessdata archive into {tessdata_dir}", file=sys.stderr) logger.info(f"Extracting tessdata archive into {tessdata_dir}")
with tarfile.open(fileobj=io.BytesIO(archive)) as t: with tarfile.open(fileobj=io.BytesIO(archive)) as t:
for lang in langs_short: for lang in langs_short:
member = f"tessdata_fast-{tag}/{lang}.traineddata" member = f"tessdata_fast-{tag}/{lang}.traineddata"
print(f">> Extracting {member}") logger.info(f"Extracting {member}")
t.extract(member=member, path=share_dir, set_attrs=False)
tessdata_dl_dir = share_dir / f"tessdata_fast-{tag}" tessdata_dl_dir = share_dir / f"tessdata_fast-{tag}"
tessdata_dl_dir.rename(tessdata_dir) tessdata_dl_dir.rename(tessdata_dir)