From 06697248fa762af9a5a8f3acb088c3d4e77af96b Mon Sep 17 00:00:00 2001
From: Alex Pyrgiotis
Date: Mon, 5 May 2025 15:47:55 +0300
Subject: [PATCH] WIP: first draft

---
 dev_scripts/inventory.py | 608 +++++++++++++++++++++++++++++++++++++++
 inventory.toml           | 170 +++++++++++
 2 files changed, 778 insertions(+)
 create mode 100755 dev_scripts/inventory.py
 create mode 100644 inventory.toml

diff --git a/dev_scripts/inventory.py b/dev_scripts/inventory.py
new file mode 100755
index 0000000..92cbaa2
--- /dev/null
+++ b/dev_scripts/inventory.py
@@ -0,0 +1,608 @@
+#!/usr/bin/env python3
+"""
+GitHub assets inventory
+
+This script keeps an inventory of assets (currently GitHub release assets) in a TOML
+file, resolves their versions (via the GitHub API and semver ranges), calculates file
+checksums, and downloads assets based on a JSON "lock" file.
+"""
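+
+# Example invocations (the platform and asset names below are illustrative;
+# see main() for the full CLI):
+#
+#   ./dev_scripts/inventory.py lock
+#   ./dev_scripts/inventory.py sync --platform darwin/arm64 podman cosign
+#   ./dev_scripts/inventory.py list --platform windows/amd64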
+
+import argparse
+import fnmatch
+import hashlib
+import json
+import os
+import platform
+import shutil
+import sys
+import tarfile
+import zipfile
+from pathlib import Path
+
+import requests
+import semver
+import toml
+from platformdirs import user_cache_dir
+
+# CONSTANTS
+CONFIG_FILE = "inventory.toml"
+LOCK_FILE = "inventory.lock"
+GITHUB_API_URL = "https://api.github.com"
+
+# Determine the cache directory using platformdirs
+CACHE_ROOT = Path(user_cache_dir("gh_assets_manager"))
+
+
+# HELPER FUNCTIONS
+def read_config():
+    try:
+        with open(CONFIG_FILE, "r") as fp:
+            return toml.load(fp)
+    except Exception as e:
+        print(f"Could not load configuration file: {e}")
+        sys.exit(1)
+
+
+def check_lock_stale(lock):
+    config = read_config()
+    config_hash = hashlib.sha256(json.dumps(config).encode()).hexdigest()
+    if config_hash != lock["config_checksum"]:
+        raise Exception(
+            "You have made changes to the inventory since you last updated the lock"
+            " file. You need to run the 'lock' command again."
+        )
+
+
+def write_lock(lock_data):
+    with open(LOCK_FILE, "w") as fp:
+        json.dump(lock_data, fp, indent=2)
+
+
+def load_lock(check=True):
+    try:
+        with open(LOCK_FILE, "r") as fp:
+            lock = json.load(fp)
+        if check:
+            check_lock_stale(lock)
+        return lock
+    except Exception as e:
+        print(f"Could not load lock file: {e}")
+        sys.exit(1)
+
+
+def calc_checksum(stream):
+    """
+    Calculate the SHA256 hash of a binary stream, reading it in 1MiB chunks.
+    """
+    h = hashlib.sha256()
+    for chunk in iter(lambda: stream.read(1024**2), b""):
+        h.update(chunk)
+    return h.hexdigest()
+
+
+def cache_file_path(url):
+    """
+    Generate a safe cache file path for a given URL,
+    using sha256(url) as the filename.
+    """
+    url_hash = hashlib.sha256(url.encode("utf-8")).hexdigest()
+    CACHE_ROOT.mkdir(parents=True, exist_ok=True)
+    return CACHE_ROOT / url_hash
+
+
+def store_checksum_in_cache(url, checksum):
+    """Store the checksum in a file whose name is based on the URL hash."""
+    checksum_path = cache_file_path(url).with_suffix(".sha256")
+    with open(checksum_path, "w") as fp:
+        fp.write(checksum)
+
+
+def read_checksum_from_cache(url):
+    checksum_path = cache_file_path(url).with_suffix(".sha256")
+    if checksum_path.exists():
+        return checksum_path.read_text().strip()
+    return None
+
+
+def get_cached_url(url):
+    """
+    If the URL exists in our local cache, return the file path;
+    otherwise return None.
+    """
+    file_path = cache_file_path(url)
+    if file_path.exists():
+        return file_path
+    return None
+
+
+def download_to_cache(url):
+    """
+    Download an asset from the given URL to the cache directory.
+    If the asset already exists in the cache, return its path.
+    Otherwise, download it, store a parallel .sha256 file (with the computed
+    hash), and return its path.
+    """
+    cached = get_cached_url(url)
+    if cached:
+        return cached
+
+    print(f"Downloading {url} into cache...")
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+
+    cached = cache_file_path(url)
+    with open(cached, "wb") as f:
+        shutil.copyfileobj(response.raw, f)
+    # Calculate and store the checksum in the cache
+    with open(cached, "rb") as f:
+        checksum = calc_checksum(f)
+    store_checksum_in_cache(url, checksum)
+    print("Download to cache completed.")
+    return cached
+
+
+def detect_platform():
+    # Return a string like 'windows/amd64', 'linux/amd64', or 'darwin/amd64'
+    sys_platform = sys.platform
+    if sys_platform.startswith("win"):
+        os_name = "windows"
+    elif sys_platform.startswith("linux"):
+        os_name = "linux"
+    elif sys_platform.startswith("darwin"):
+        os_name = "darwin"
+    else:
+        os_name = sys_platform
+
+    machine = platform.machine().lower()
+    # Normalize architecture names
+    arch = {"x86_64": "amd64", "amd64": "amd64", "arm64": "arm64"}.get(machine, machine)
+    return f"{os_name}/{arch}"
+
+
+def get_latest_release(repo, semver_range):
+    """
+    Query the GitHub API for the repo's releases, parse their tags as semver, and
+    choose the latest release matching the given semver_range string
+    (e.g., ">=1.0.1", "==1.2.2").
+    """
+    url = f"{GITHUB_API_URL}/repos/{repo}/releases"
+    response = requests.get(url)
+    if response.status_code != 200:
+        print(f"Failed to fetch releases for repo {repo}. HTTP {response.status_code}")
+        return None
+    releases = response.json()
+
+    matching = []
+    for release in releases:
+        tag = release.get("tag_name", "")
+        # Strip a 'v' prefix, if present
+        version_str = tag.lstrip("v")
+        try:
+            version = semver.VersionInfo.parse(version_str)
+            # Skip prereleases and non-matching versions
+            if release["prerelease"] or not version.match(semver_range):
+                continue
+            matching.append((release, version))
+        except ValueError:
+            continue
+
+    if not matching:
+        print(f"No releases match version requirement {semver_range} for repo {repo}")
+        return None
+
+    return max(matching, key=lambda x: x[1])[0]
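+
+
+# For example, with version = ">=1.0.1" (hypothetical tags): a release tagged
+# v1.0.0 falls outside the range, releases marked as prereleases on GitHub are
+# skipped, and tags that don't parse as semver are ignored. If v1.0.1 and
+# v1.2.2 both remain, the highest match, v1.2.2, is returned.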
+ """ + cached_file = download_to_cache(url) + with open(cached_file, "rb") as f: + computed_checksum = calc_checksum(f) + + if computed_checksum != expected_checksum: + # Remove cache file and its checksum file + try: + cached_file.unlink() + checksum_file = cached_file.with_suffix(".sha256") + if checksum_file.exists(): + checksum_file.unlink() + except Exception: + pass + raise Exception( + f"Hash mismatch for URL {url}: computed '{computed_checksum}', expected '{expected_checksum}'" + ) + return cached_file + + +def determine_extract_opts(extract): + """ + Determine globs and flatten settings. + """ + if isinstance(extract, dict): + globs = extract.get("globs", ["*"]) + flatten = extract.get("flatten", False) + elif isinstance(extract, list): + globs = extract + flatten = False + elif isinstance(extract, bool): + globs = ["*"] + flatten = False + else: + raise Exception(f"Unexpected format for 'extract' field: {extract}") + + return { + "globs": globs, + "flatten": flatten, + } + + +def detect_archive_type(name): + """ + Detect the filetype of the archive based on its name. + """ + if name.endswith(".tar.gz") or name.endswith(".tgz") or name == "!tarball": + return "tar.gz" + if name.endswith(".tar"): + return "tar" + if name.endswith(".zip") or name == "!zipball": + return "zip" + raise Exception(f"Unsupported archive type for extraction: {name}") + + +def flatten_extracted_files(destination): + """ + After extraction, move all files found in subdirectories of destination into destination root. + """ + for root, dirs, files in os.walk(destination): + # Skip the root directory itself + if Path(root) == destination: + continue + for file in files: + src_file = Path(root) / file + dst_file = destination / file + # If a file with the same name exists, we can overwrite or rename. + shutil.move(str(src_file), str(dst_file)) + # Optionally, remove now-empty subdirectories. + for root, dirs, files in os.walk(destination, topdown=False): + for d in dirs: + dir_path = Path(root) / d + try: + dir_path.rmdir() + except OSError: + pass + + +def extract_asset(archive_path, destination, options): + """ + Extract the asset from archive_path to destination. + + Accepts a dictionary withe the following options: + * 'globs': A list of patterns that will be used to match members in the archive. + If a member does not match a pattern, it will not be extracted. + * 'flatten': A boolean value. If true, after extraction, move all files to the + destination root. + * 'filetype': The type of the archive, which indicates how it will get extracted. + + For tarfiles, use filter="data" when extracting to mitigate malicious tar entries. 
+ """ + ft = options["filetype"] + globs = options["globs"] + flatten = options["flatten"] + + if ft in ("tar.gz", "tar"): + mode = "r:gz" if ft == "tar.gz" else "r" + try: + with tarfile.open(archive_path, "r:gz") as tar: + members = [ + m + for m in tar.getmembers() + if any(fnmatch.fnmatch(m.name, glob) for glob in globs) + ] + if not members: + raise Exception("Globs did not match any files in the archive") + tar.extractall(path=destination, members=members, filter="data") + except Exception as e: + raise Exception(f"Error extracting '{archive_path}': {e}") + elif ft == "zip": + try: + with zipfile.ZipFile(archive_path, "r") as zip_ref: + members = [ + m + for m in zip_ref.namelist() + if any(fnmatch.fnmatch(m, glob) for glob in globs) + ] + if not members: + raise Exception("Globs did not match any files in the archive") + zip_ref.extractall(path=destination, members=members) + except Exception as e: + raise Exception(f"Error extracting zip archive: {e}") + else: + raise Exception(f"Unsupported archive type for file {archive_path}") + + if flatten: + flatten_extracted_files(destination) + + print(f"Extraction of {archive_path} complete.") + + +def get_platform_assets(assets, platform): + """ + List the assets that are associated with a specific platform. + """ + + plat_assets = {} + for asset_name, asset_entry in assets.items(): + if platform in asset_entry: + plat_assets[asset_name] = asset_entry[platform] + elif "all" in asset_entry: + plat_assets[asset_name] = asset_entry["all"] + return plat_assets + + +def chmod_exec(path): + if path.is_dir(): + for root, _, files in path.walk(): + for name in files: + f = root / name + f.chmod(f.stat().st_mode | 0o111) + else: + path.chmod(path.stat().st_mode | 0o111) + + +# COMMAND FUNCTIONS +def cmd_lock(args): + """ + Reads the configuration file, queries GitHub for each asset to determine + the actual release and asset URL, calculates checksum if the asset exists locally. + Then outputs/updates the lock file (in JSON format). + + Changes: + - Uses "destination" instead of "download_path" in the config. + - Uses caching when fetching/hashing assets. + - Renders asset filenames if they contain {version}. + - Supports a 'platform.all' key for platform-agnostic assets. + """ + config = read_config() + lock = {"assets": {}} + # Expected config structure in config.toml: + # [asset.] + # repo = "owner/repo" + # version = ">=1.0.1" # semver expression + # platform."windows/amd64" = "asset-windows.exe" + # platform."linux/amd64" = "asset-linux" + # platform.all = "universal-asset.zip" + # executable = true|false # whether to mark downloaded file as executable + # destination = "./downloads/asset.exe" + # extract = either false, a list of globs, + # or a table with keys: globs = ["glob1", "glob2"] + # and flatten = True|False. 
+
+
+def get_platform_assets(assets, plat):
+    """
+    List the assets that are associated with a specific platform.
+    """
+    plat_assets = {}
+    for asset_name, asset_entry in assets.items():
+        if plat in asset_entry:
+            plat_assets[asset_name] = asset_entry[plat]
+        elif "all" in asset_entry:
+            plat_assets[asset_name] = asset_entry["all"]
+    return plat_assets
+
+
+def chmod_exec(path):
+    if path.is_dir():
+        # Note: use os.walk() instead of Path.walk(), since the latter is
+        # available only on Python 3.12+.
+        for root, _, files in os.walk(path):
+            for name in files:
+                f = Path(root) / name
+                f.chmod(f.stat().st_mode | 0o111)
+    else:
+        path.chmod(path.stat().st_mode | 0o111)
+
+
+# COMMAND FUNCTIONS
+def cmd_lock(args):
+    """
+    Read the configuration file, query GitHub for each asset to determine the
+    actual release and asset URL, and calculate each asset's checksum
+    (downloading into the cache as needed). Then output/update the lock file
+    (in JSON format).
+
+    Changes:
+    - Uses "destination" instead of "download_path" in the config.
+    - Uses caching when fetching/hashing assets.
+    - Renders asset filenames if they contain {version}.
+    - Supports a 'platform.all' key for platform-agnostic assets.
+    """
+    config = read_config()
+    lock = {"assets": {}}
+    # Expected config structure in inventory.toml:
+    #   [asset.<name>]
+    #   repo = "owner/repo"
+    #   version = ">=1.0.1"  # semver expression
+    #   platform."windows/amd64" = "asset-windows.exe"
+    #   platform."linux/amd64" = "asset-linux"
+    #   platform.all = "universal-asset.zip"
+    #   executable = true|false  # whether to mark the downloaded file as executable
+    #   destination = "./downloads/asset.exe"
+    #   extract = either false, a list of globs, or a table with keys
+    #             globs = ["glob1", "glob2"] and flatten = true|false
+    assets_cfg = config.get("asset", {})
+    if not assets_cfg:
+        print("No assets defined under the [asset] section in the config file.")
+        sys.exit(1)
+
+    for asset_name, asset in assets_cfg.items():
+        repo = asset.get("repo")
+        version_range = asset.get("version")
+        asset_map = asset.get("platform")  # mapping of platform -> asset file name
+        executable = asset.get("executable")
+        destination_str = asset.get("destination")
+        extract = asset.get("extract", False)
+
+        if extract:
+            extract = determine_extract_opts(extract)
+
+        if not repo or not version_range or not asset_map or not destination_str:
+            print(f"Asset {asset_name} is missing required fields.")
+            continue
+
+        print(f"Processing asset '{asset_name}' for repo '{repo}' ...")
+        release = get_latest_release(repo, version_range)
+        if not release:
+            print(f"Could not resolve release for asset '{asset_name}'.")
+            continue
+
+        lock_assets = lock["assets"]
+        asset_lock_data = {}
+        # Process each defined platform key in the asset_map
+        for plat_key, plat_name in asset_map.items():
+            download_url = resolve_asset_for_platform(release, plat_name)
+            if not download_url:
+                print(
+                    f"Warning: No asset found for platform '{plat_key}' in repo '{repo}' for asset '{asset_name}'."
+                )
+                continue
+
+            plat_extract = extract
+            if extract:
+                # Copy the extract options per platform, since the archive type
+                # may differ across platforms.
+                plat_extract = dict(extract)
+                plat_extract["filetype"] = detect_archive_type(plat_name)
+            print(f"Hashing asset '{asset_name}' for platform '{plat_key}'...")
+            checksum = hash_asset(download_url)
+            asset_lock_data[plat_key] = {
+                "repo": repo,
+                "download_url": download_url,
+                "version": release.get("tag_name").lstrip("v"),
+                "checksum": checksum,
+                "executable": executable,
+                "destination": destination_str,
+                "extract": plat_extract,
+            }
+        if not asset_lock_data:
+            print(f"No valid platforms found for asset '{asset_name}'.")
+            continue
+        lock_assets[asset_name] = asset_lock_data
+
+    config_hash = hashlib.sha256(json.dumps(config).encode()).hexdigest()
+    lock["config_checksum"] = config_hash
+    write_lock(lock)
+    print(f"Lock file '{LOCK_FILE}' updated.")
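+
+
+# A lock file produced by cmd_lock() looks roughly like this (hypothetical
+# checksum and URL):
+#
+# {
+#   "assets": {
+#     "cosign": {
+#       "linux/amd64": {
+#         "repo": "sigstore/cosign",
+#         "download_url": "https://github.com/sigstore/cosign/releases/...",
+#         "version": "2.5.0",
+#         "checksum": "3f1e...",
+#         "executable": true,
+#         "destination": "share/vendor/cosign",
+#         "extract": false
+#       }
+#     }
+#   },
+#   "config_checksum": "a1b2..."
+# }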
+
+
+def cmd_sync(args):
+    """
+    Sync assets based on the lock file. Accepts an optional platform argument
+    to limit downloads, and an optional list of asset names.
+
+    Features:
+    - Uses caching: downloads happen into the cache, get verified against the
+      expected hash, and are finally copied to the destination.
+    - If the executable field is set, mark the downloaded file(s) as executable.
+    - Extraction, based on the extract field:
+      - If false or missing: no extraction; the file is copied as-is.
+      - If a list: treated as a list of globs.
+      - If a table: expects the key "globs" and, optionally, "flatten".
+    - For platform-agnostic assets, an entry with key "platform.all" is used if
+      the requested platform is not found.
+    """
+    lock = load_lock()
+    target_plat = args.platform if args.platform else detect_platform()
+    print(f"Target platform: {target_plat}")
+    lock_assets = lock["assets"]
+    asset_list = (
+        args.assets
+        if args.assets
+        else get_platform_assets(lock_assets, target_plat).keys()
+    )
+
+    # Validate asset names and platform entries
+    for asset_name in asset_list:
+        if asset_name not in lock_assets:
+            raise Exception(f"Asset '{asset_name}' not found in the lock file.")
+        asset_entry = lock_assets[asset_name]
+
+        # If the asset entry does not define the specific platform we're looking
+        # for, fall back to its "platform.all" entry, if one exists.
+        plat_key = target_plat
+        if plat_key not in asset_entry:
+            if "all" in asset_entry:
+                plat_key = "all"
+            else:
+                raise Exception(
+                    f"No entry for platform '{target_plat}' or 'platform.all' in asset"
+                    f" '{asset_name}'"
+                )
+
+        info = asset_entry[plat_key]
+        download_url = info["download_url"]
+        destination = Path(info["destination"])
+        expected_checksum = info["checksum"]
+        executable = info["executable"]
+        extract = info.get("extract", False)
+        try:
+            print(f"Processing asset '{asset_name}' for platform '{plat_key}'...")
+            cached_file = download_to_cache_and_verify(download_url, expected_checksum)
+            # Remove destination if it exists already.
+            if destination.exists():
+                if destination.is_dir():
+                    shutil.rmtree(destination)
+                else:
+                    destination.unlink()
+            # If extraction is requested
+            if extract:
+                destination.mkdir(parents=True, exist_ok=True)
+                extract_asset(
+                    cached_file,
+                    destination,
+                    options=extract,
+                )
+            else:
+                destination.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy2(cached_file, destination)
+            if executable:
+                chmod_exec(destination)
+        except Exception as e:
+            print(
+                f"Error processing asset '{asset_name}' for platform '{plat_key}': {e}"
+            )
+            continue

+    print("Downloads completed")
+
+
+def cmd_list(args):
+    """
+    List assets and their versions based on the lock file and the provided
+    platform. If a platform is not provided, list assets for the current one.
+    """
+    lock = load_lock()
+    target_plat = args.platform if args.platform else detect_platform()
+    assets = get_platform_assets(lock["assets"], target_plat)
+    for asset_name in sorted(assets.keys()):
+        asset = assets[asset_name]
+        print(f"{asset_name} {asset['version']} {asset['download_url']}")
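+
+
+# Sample `list` output (illustrative versions and URLs):
+#
+#   cosign 2.5.0 https://github.com/sigstore/cosign/releases/download/v2.5.0/cosign-darwin-arm64
+#   podman 5.4.2 https://github.com/containers/podman/releases/download/v5.4.2/podman-remote-release-darwin_arm64.zip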
+
+
+def main():
+    parser = argparse.ArgumentParser(description="GitHub Release Assets Manager")
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    lock_parser = subparsers.add_parser(
+        "lock", help="Update lock file from config (without downloading)"
+    )
+    lock_parser.set_defaults(func=cmd_lock)
+
+    sync_parser = subparsers.add_parser("sync", help="Sync assets as per lock file")
+    sync_parser.add_argument(
+        "--platform",
+        help="Platform name/arch (e.g., windows/amd64) to download assets for",
+    )
+    sync_parser.add_argument(
+        "assets",
+        nargs="*",
+        help="Specific asset names to download. If omitted, download all assets.",
+    )
+    sync_parser.set_defaults(func=cmd_sync)
+
+    list_parser = subparsers.add_parser("list", help="List assets for a platform")
+    list_parser.add_argument(
+        "--platform",
+        help="Platform name/arch (e.g., windows/amd64) to list assets for",
+    )
+    list_parser.set_defaults(func=cmd_list)
+    args = parser.parse_args()
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/inventory.toml b/inventory.toml
new file mode 100644
index 0000000..41add4e
--- /dev/null
+++ b/inventory.toml
@@ -0,0 +1,170 @@
+[asset.podman]
+repo = "containers/podman"
+version = ">=5.4.2"
+platform."windows/amd64" = "podman-remote-release-windows_amd64.zip"
+platform."darwin/amd64" = "podman-remote-release-darwin_amd64.zip"
+platform."darwin/arm64" = "podman-remote-release-darwin_arm64.zip"
+destination = "share/vendor/podman"
+executable = true
+extract.globs = ["**/bin/*"]
+extract.flatten = true
+
+[asset.gvproxy]
+repo = "containers/gvisor-tap-vsock"
+version = ">=0.8.5"
+platform."darwin/amd64" = "gvproxy-darwin"
+platform."darwin/arm64" = "gvproxy-darwin"
+executable = true
+destination = "share/vendor/podman/gvproxy"
+
+[asset.vfkit]
+repo = "crc-org/vfkit"
+version = ">=0.6.1"
+platform."darwin/amd64" = "vfkit"
+platform."darwin/arm64" = "vfkit"
+executable = true
+destination = "share/vendor/podman/vfkit"
+
+[asset.cosign]
+repo = "sigstore/cosign"
+version = ">=2.5.0"
+platform."darwin/amd64" = "cosign-darwin-amd64"
+platform."darwin/arm64" = "cosign-darwin-arm64"
+platform."linux/amd64" = "cosign-linux-amd64"
+platform."windows/amd64" = "cosign-windows-amd64.exe"
+executable = true
+destination = "share/vendor/cosign"
+
+[asset.tessdata]
+repo = "tesseract-ocr/tessdata_fast"
+version = ">=4.0.0"
+platform.all = "!tarball"
+destination = "share/tessdata"
+# HACK: Globs taken with:
+# python -c 'import json; f = open("share/ocr-languages.json"); values = json.loads(f.read()).values(); [print(f"    \"*/{v}.traineddata\",") for v in values]'
+extract.globs = [
+    "*/afr.traineddata",
+    "*/amh.traineddata",
+    "*/ara.traineddata",
+    "*/asm.traineddata",
+    "*/aze.traineddata",
+    "*/aze_cyrl.traineddata",
+    "*/bel.traineddata",
+    "*/ben.traineddata",
+    "*/bod.traineddata",
+    "*/bos.traineddata",
+    "*/bre.traineddata",
+    "*/bul.traineddata",
+    "*/cat.traineddata",
+    "*/ceb.traineddata",
+    "*/ces.traineddata",
+    "*/chi_sim.traineddata",
+    "*/chi_sim_vert.traineddata",
+    "*/chi_tra.traineddata",
+    "*/chi_tra_vert.traineddata",
+    "*/chr.traineddata",
+    "*/cos.traineddata",
+    "*/cym.traineddata",
+    "*/dan.traineddata",
+    "*/deu.traineddata",
+    "*/div.traineddata",
+    "*/dzo.traineddata",
+    "*/ell.traineddata",
+    "*/eng.traineddata",
+    "*/enm.traineddata",
+    "*/epo.traineddata",
+    "*/est.traineddata",
+    "*/eus.traineddata",
+    "*/fao.traineddata",
+    "*/fas.traineddata",
+    "*/fil.traineddata",
+    "*/fin.traineddata",
+    "*/fra.traineddata",
+    "*/frk.traineddata",
+    "*/frm.traineddata",
+    "*/fry.traineddata",
+    "*/gla.traineddata",
+    "*/gle.traineddata",
+    "*/glg.traineddata",
+    "*/grc.traineddata",
+    "*/guj.traineddata",
+    "*/hat.traineddata",
+    "*/heb.traineddata",
+    "*/hin.traineddata",
+    "*/hrv.traineddata",
+    "*/hun.traineddata",
+    "*/hye.traineddata",
+    "*/iku.traineddata",
+    "*/ind.traineddata",
+    "*/isl.traineddata",
+    "*/ita.traineddata",
+    "*/ita_old.traineddata",
+    "*/jav.traineddata",
+    "*/jpn.traineddata",
+    "*/jpn_vert.traineddata",
+    "*/kan.traineddata",
+    "*/kat.traineddata",
+    "*/kat_old.traineddata",
+    "*/kaz.traineddata",
+    "*/khm.traineddata",
"*/kir.traineddata", + "*/kmr.traineddata", + "*/kor.traineddata", + "*/kor_vert.traineddata", + "*/lao.traineddata", + "*/lat.traineddata", + "*/lav.traineddata", + "*/lit.traineddata", + "*/ltz.traineddata", + "*/mal.traineddata", + "*/mar.traineddata", + "*/mkd.traineddata", + "*/mlt.traineddata", + "*/mon.traineddata", + "*/mri.traineddata", + "*/msa.traineddata", + "*/mya.traineddata", + "*/nep.traineddata", + "*/nld.traineddata", + "*/nor.traineddata", + "*/oci.traineddata", + "*/ori.traineddata", + "*/pan.traineddata", + "*/pol.traineddata", + "*/por.traineddata", + "*/pus.traineddata", + "*/que.traineddata", + "*/ron.traineddata", + "*/rus.traineddata", + "*/san.traineddata", + "*/sin.traineddata", + "*/slk.traineddata", + "*/slv.traineddata", + "*/snd.traineddata", + "*/spa.traineddata", + "*/spa_old.traineddata", + "*/sqi.traineddata", + "*/srp.traineddata", + "*/srp_latn.traineddata", + "*/sun.traineddata", + "*/swa.traineddata", + "*/swe.traineddata", + "*/syr.traineddata", + "*/tam.traineddata", + "*/tat.traineddata", + "*/tel.traineddata", + "*/tgk.traineddata", + "*/tha.traineddata", + "*/tir.traineddata", + "*/ton.traineddata", + "*/tur.traineddata", + "*/uig.traineddata", + "*/ukr.traineddata", + "*/urd.traineddata", + "*/uzb.traineddata", + "*/uzb_cyrl.traineddata", + "*/vie.traineddata", + "*/yid.traineddata", + "*/yor.traineddata", +] +extract.flatten = true