Switch to tessdata-fast Tesseract model

Switch from the tessdata Tesseract model to the tessdata-fast one. The
tessdata-fast model is much smaller and a bit faster than the tessdata
model. It is also the model that Debian and Fedora ship by default.

Closes #545
This commit is contained in:
Alex Pyrgiotis 2023-09-18 12:30:34 +03:00
parent e64d1da61f
commit cbca9110ca
No known key found for this signature in database
GPG key ID: B6C15EBA0357C9AA
2 changed files with 5 additions and 9 deletions

View file

@@ -1,6 +1,6 @@
FROM alpine:edge
ARG TESSDATA_CHECKSUM=990fffb9b7a9b52dc9a2d053a9ef6852ca2b72bd8dfb22988b0b990a700fd3c7
ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d4c66071a
# Install dependencies
@@ -26,11 +26,11 @@ RUN apk --no-cache -U upgrade && \
#
# Before we untar the models, we also check if the checksum is the expected one.
RUN mkdir tessdata && cd tessdata \
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata/releases/latest \
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
| sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
&& wget https://github.com/tesseract-ocr/tessdata/archive/$TESSDATA_VERSION/tessdata-$TESSDATA_VERSION.tar.gz \
&& echo "$TESSDATA_CHECKSUM tessdata-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
&& tar -xzvf tessdata-$TESSDATA_VERSION.tar.gz -C . \
&& wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
&& echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
&& tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
&& find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata \; \
&& cd .. && rm -r tessdata

View file

@@ -22,9 +22,7 @@
"Corsican": "cos",
"Welsh": "cym",
"Danish": "dan",
"Danish - Fraktur": "dan_frak",
"German": "deu",
"German - Fraktur": "deu_frak",
"Divehi": "div",
"Dzongkha": "dzo",
"Greek": "ell",
@@ -97,7 +95,6 @@
"Sanskrit": "san",
"Sinhala": "sin",
"Slovakian": "slk",
"Slovak - Fraktur": "slk_frak",
"Slovenian": "slv",
"Sindhi": "snd",
"Spanish": "spa",
@@ -113,7 +110,6 @@
"Tatar": "tat",
"Telugu": "tel",
"Tajik": "tgk",
"Tagalog (new - Filipino)": "tgl",
"Thai": "tha",
"Tigrinya": "tir",
"Tonga": "ton",