mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 09:52:37 +02:00
Switch to tessdata-fast Tesseract model
Switch to the tessdata-fast Tesseract model instead of the tessdata one. The tessdata-fast model is much smaller and a bit faster than the other one. It is also the model that Debian and Fedora ship by default. Closes #545
This commit is contained in:
parent
e64d1da61f
commit
cbca9110ca
2 changed files with 5 additions and 9 deletions
10
Dockerfile
10
Dockerfile
|
@ -1,6 +1,6 @@
|
|||
FROM alpine:edge
|
||||
|
||||
ARG TESSDATA_CHECKSUM=990fffb9b7a9b52dc9a2d053a9ef6852ca2b72bd8dfb22988b0b990a700fd3c7
|
||||
ARG TESSDATA_CHECKSUM=d0e3bb6f3b4e75748680524a1d116f2bfb145618f8ceed55b279d15098a530f9
|
||||
ARG H2ORESTART_CHECKSUM=5db816a1e57b510456633f55e693cb5ef3675ef8b35df4f31c90ab9d4c66071a
|
||||
|
||||
# Install dependencies
|
||||
|
@ -26,11 +26,11 @@ RUN apk --no-cache -U upgrade && \
|
|||
#
|
||||
# Before we untar the models, we also check if the checksum is the expected one.
|
||||
RUN mkdir tessdata && cd tessdata \
|
||||
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata/releases/latest \
|
||||
&& TESSDATA_VERSION=$(wget -O- -nv https://api.github.com/repos/tesseract-ocr/tessdata_fast/releases/latest \
|
||||
| sed -n 's/^.*"tag_name": "\([0-9.]\+\)".*$/\1/p') \
|
||||
&& wget https://github.com/tesseract-ocr/tessdata/archive/$TESSDATA_VERSION/tessdata-$TESSDATA_VERSION.tar.gz \
|
||||
&& echo "$TESSDATA_CHECKSUM tessdata-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
|
||||
&& tar -xzvf tessdata-$TESSDATA_VERSION.tar.gz -C . \
|
||||
&& wget https://github.com/tesseract-ocr/tessdata_fast/archive/$TESSDATA_VERSION/tessdata_fast-$TESSDATA_VERSION.tar.gz \
|
||||
&& echo "$TESSDATA_CHECKSUM tessdata_fast-$TESSDATA_VERSION.tar.gz" | sha256sum -c \
|
||||
&& tar -xzvf tessdata_fast-$TESSDATA_VERSION.tar.gz -C . \
|
||||
&& find . -name '*.traineddata' -maxdepth 2 -exec cp {} /usr/share/tessdata \; \
|
||||
&& cd .. && rm -r tessdata
|
||||
|
||||
|
|
|
@ -22,9 +22,7 @@
|
|||
"Corsican": "cos",
|
||||
"Welsh": "cym",
|
||||
"Danish": "dan",
|
||||
"Danish - Fraktur": "dan_frak",
|
||||
"German": "deu",
|
||||
"German - Fraktur": "deu_frak",
|
||||
"Divehi": "div",
|
||||
"Dzongkha": "dzo",
|
||||
"Greek": "ell",
|
||||
|
@ -97,7 +95,6 @@
|
|||
"Sanskrit": "san",
|
||||
"Sinhala": "sin",
|
||||
"Slovakian": "slk",
|
||||
"Slovak - Fraktur": "slk_frak",
|
||||
"Slovenian": "slv",
|
||||
"Sindhi": "snd",
|
||||
"Spanish": "spa",
|
||||
|
@ -113,7 +110,6 @@
|
|||
"Tatar": "tat",
|
||||
"Telugu": "tel",
|
||||
"Tajik": "tgk",
|
||||
"Tagalog (new - Filipino)": "tgl",
|
||||
"Thai": "tha",
|
||||
"Tigrinya": "tir",
|
||||
"Tonga": "ton",
|
||||
|
|
Loading…
Reference in a new issue