From 2d6826afa9c58a7be858385b7ea0133fc3c2183b Mon Sep 17 00:00:00 2001 From: deeplow Date: Thu, 15 Sep 2022 09:38:34 +0100 Subject: [PATCH] move ocr_languages from global_common to share/ ocr_languages can be treated as just a json file instead of being in global_common. This way it is easier to maintain and makes global_common cleaner. --- dangerzone/global_common.py | 164 +----------------------------------- share/ocr-languages.json | 162 +++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 162 deletions(-) create mode 100644 share/ocr-languages.json diff --git a/dangerzone/global_common.py b/dangerzone/global_common.py index a9fe958..a55d01d 100644 --- a/dangerzone/global_common.py +++ b/dangerzone/global_common.py @@ -34,168 +34,8 @@ class GlobalCommon(object): self.container_name = "dangerzone.rocks/dangerzone" # Languages supported by tesseract - self.ocr_languages = { - "Afrikaans": "ar", - "Albanian": "sqi", - "Amharic": "amh", - "Arabic": "ara", - "Arabic script": "Arabic", - "Armenian": "hye", - "Armenian script": "Armenian", - "Assamese": "asm", - "Azerbaijani": "aze", - "Azerbaijani (Cyrillic)": "aze_cyrl", - "Basque": "eus", - "Belarusian": "bel", - "Bengali": "ben", - "Bengali script": "Bengali", - "Bosnian": "bos", - "Breton": "bre", - "Bulgarian": "bul", - "Burmese": "mya", - "Canadian Aboriginal script": "Canadian_Aboriginal", - "Catalan": "cat", - "Cebuano": "ceb", - "Cherokee": "chr", - "Cherokee script": "Cherokee", - "Chinese - Simplified": "chi_sim", - "Chinese - Simplified (vertical)": "chi_sim_vert", - "Chinese - Traditional": "chi_tra", - "Chinese - Traditional (vertical)": "chi_tra_vert", - "Corsican": "cos", - "Croatian": "hrv", - "Cyrillic script": "Cyrillic", - "Czech": "ces", - "Danish": "dan", - "Devanagari script": "Devanagari", - "Divehi": "div", - "Dutch": "nld", - "Dzongkha": "dzo", - "English": "eng", - "English, Middle (1100-1500)": "enm", - "Esperanto": "epo", - "Estonian": "est", - "Ethiopic script": "Ethiopic", - "Faroese": "fao", - "Filipino": "fil", - "Finnish": "fin", - "Fraktur script": "Fraktur", - "Frankish": "frk", - "French": "fra", - "French, Middle (ca.1400-1600)": "frm", - "Frisian (Western)": "fry", - "Gaelic (Scots)": "gla", - "Galician": "glg", - "Georgian": "kat", - "Georgian script": "Georgian", - "German": "deu", - "Greek": "ell", - "Greek script": "Greek", - "Gujarati": "guj", - "Gujarati script": "Gujarati", - "Gurmukhi script": "Gurmukhi", - "Hangul script": "Hangul", - "Hangul (vertical) script": "Hangul_vert", - "Han - Simplified script": "HanS", - "Han - Simplified (vertical) script": "HanS_vert", - "Han - Traditional script": "HanT", - "Han - Traditional (vertical) script": "HanT_vert", - "Hatian": "hat", - "Hebrew": "heb", - "Hebrew script": "Hebrew", - "Hindi": "hin", - "Hungarian": "hun", - "Icelandic": "isl", - "Indonesian": "ind", - "Inuktitut": "iku", - "Irish": "gle", - "Italian": "ita", - "Italian - Old": "ita_old", - "Japanese": "jpn", - "Japanese script": "Japanese", - "Japanese (vertical)": "jpn_vert", - "Japanese (vertical) script": "Japanese_vert", - "Javanese": "jav", - "Kannada": "kan", - "Kannada script": "Kannada", - "Kazakh": "kaz", - "Khmer": "khm", - "Khmer script": "Khmer", - "Korean": "kor", - "Korean (vertical)": "kor_vert", - "Kurdish (Arabic)": "kur_ara", - "Kyrgyz": "kir", - "Lao": "lao", - "Lao script": "Lao", - "Latin": "lat", - "Latin script": "Latin", - "Latvian": "lav", - "Lithuanian": "lit", - "Luxembourgish": "ltz", - "Macedonian": "mkd", - "Malayalam": "mal", - "Malayalam script": "Malayalam", - "Malay": "msa", - "Maltese": "mlt", - "Maori": "mri", - "Marathi": "mar", - "Mongolian": "mon", - "Myanmar script": "Myanmar", - "Nepali": "nep", - "Norwegian": "nor", - "Occitan (post 1500)": "oci", - "Old Georgian": "kat_old", - "Oriya (Odia) script": "Oriya", - "Oriya": "ori", - "Pashto": "pus", - "Persian": "fas", - "Polish": "pol", - "Portuguese": "por", - "Punjabi": "pan", - "Quechua": "que", - "Romanian": "ron", - "Russian": "rus", - "Sanskrit": "san", - "script and orientation": "osd", - "Serbian (Latin)": "srp_latn", - "Serbian": "srp", - "Sindhi": "snd", - "Sinhala script": "Sinhala", - "Sinhala": "sin", - "Slovakian": "slk", - "Slovenian": "slv", - "Spanish, Castilian - Old": "spa_old", - "Spanish": "spa", - "Sundanese": "sun", - "Swahili": "swa", - "Swedish": "swe", - "Syriac script": "Syriac", - "Syriac": "syr", - "Tajik": "tgk", - "Tamil script": "Tamil", - "Tamil": "tam", - "Tatar": "tat", - "Telugu script": "Telugu", - "Telugu": "tel", - "Thaana script": "Thaana", - "Thai script": "Thai", - "Thai": "tha", - "Tibetan script": "Tibetan", - "Tibetan Standard": "bod", - "Tigrinya": "tir", - "Tonga": "ton", - "Turkish": "tur", - "Ukrainian": "ukr", - "Urdu": "urd", - "Uyghur": "uig", - "Uzbek (Cyrillic)": "uzb_cyrl", - "Uzbek": "uzb", - "Vietnamese script": "Vietnamese", - "Vietnamese": "vie", - "Welsh": "cym", - "Yiddish": "yid", - "Yoruba": "yor", - } + with open(get_resource_path("ocr-languages.json"), "r") as f: + self.ocr_languages = json.load(f) # Load settings self.settings = Settings(self) diff --git a/share/ocr-languages.json b/share/ocr-languages.json new file mode 100644 index 0000000..be6e686 --- /dev/null +++ b/share/ocr-languages.json @@ -0,0 +1,162 @@ +{ + "Afrikaans": "ar", + "Albanian": "sqi", + "Amharic": "amh", + "Arabic": "ara", + "Arabic script": "Arabic", + "Armenian": "hye", + "Armenian script": "Armenian", + "Assamese": "asm", + "Azerbaijani": "aze", + "Azerbaijani (Cyrillic)": "aze_cyrl", + "Basque": "eus", + "Belarusian": "bel", + "Bengali": "ben", + "Bengali script": "Bengali", + "Bosnian": "bos", + "Breton": "bre", + "Bulgarian": "bul", + "Burmese": "mya", + "Canadian Aboriginal script": "Canadian_Aboriginal", + "Catalan": "cat", + "Cebuano": "ceb", + "Cherokee": "chr", + "Cherokee script": "Cherokee", + "Chinese - Simplified": "chi_sim", + "Chinese - Simplified (vertical)": "chi_sim_vert", + "Chinese - Traditional": "chi_tra", + "Chinese - Traditional (vertical)": "chi_tra_vert", + "Corsican": "cos", + "Croatian": "hrv", + "Cyrillic script": "Cyrillic", + "Czech": "ces", + "Danish": "dan", + "Devanagari script": "Devanagari", + "Divehi": "div", + "Dutch": "nld", + "Dzongkha": "dzo", + "English": "eng", + "English, Middle (1100-1500)": "enm", + "Esperanto": "epo", + "Estonian": "est", + "Ethiopic script": "Ethiopic", + "Faroese": "fao", + "Filipino": "fil", + "Finnish": "fin", + "Fraktur script": "Fraktur", + "Frankish": "frk", + "French": "fra", + "French, Middle (ca.1400-1600)": "frm", + "Frisian (Western)": "fry", + "Gaelic (Scots)": "gla", + "Galician": "glg", + "Georgian": "kat", + "Georgian script": "Georgian", + "German": "deu", + "Greek": "ell", + "Greek script": "Greek", + "Gujarati": "guj", + "Gujarati script": "Gujarati", + "Gurmukhi script": "Gurmukhi", + "Hangul script": "Hangul", + "Hangul (vertical) script": "Hangul_vert", + "Han - Simplified script": "HanS", + "Han - Simplified (vertical) script": "HanS_vert", + "Han - Traditional script": "HanT", + "Han - Traditional (vertical) script": "HanT_vert", + "Hatian": "hat", + "Hebrew": "heb", + "Hebrew script": "Hebrew", + "Hindi": "hin", + "Hungarian": "hun", + "Icelandic": "isl", + "Indonesian": "ind", + "Inuktitut": "iku", + "Irish": "gle", + "Italian": "ita", + "Italian - Old": "ita_old", + "Japanese": "jpn", + "Japanese script": "Japanese", + "Japanese (vertical)": "jpn_vert", + "Japanese (vertical) script": "Japanese_vert", + "Javanese": "jav", + "Kannada": "kan", + "Kannada script": "Kannada", + "Kazakh": "kaz", + "Khmer": "khm", + "Khmer script": "Khmer", + "Korean": "kor", + "Korean (vertical)": "kor_vert", + "Kurdish (Arabic)": "kur_ara", + "Kyrgyz": "kir", + "Lao": "lao", + "Lao script": "Lao", + "Latin": "lat", + "Latin script": "Latin", + "Latvian": "lav", + "Lithuanian": "lit", + "Luxembourgish": "ltz", + "Macedonian": "mkd", + "Malayalam": "mal", + "Malayalam script": "Malayalam", + "Malay": "msa", + "Maltese": "mlt", + "Maori": "mri", + "Marathi": "mar", + "Mongolian": "mon", + "Myanmar script": "Myanmar", + "Nepali": "nep", + "Norwegian": "nor", + "Occitan (post 1500)": "oci", + "Old Georgian": "kat_old", + "Oriya (Odia) script": "Oriya", + "Oriya": "ori", + "Pashto": "pus", + "Persian": "fas", + "Polish": "pol", + "Portuguese": "por", + "Punjabi": "pan", + "Quechua": "que", + "Romanian": "ron", + "Russian": "rus", + "Sanskrit": "san", + "script and orientation": "osd", + "Serbian (Latin)": "srp_latn", + "Serbian": "srp", + "Sindhi": "snd", + "Sinhala script": "Sinhala", + "Sinhala": "sin", + "Slovakian": "slk", + "Slovenian": "slv", + "Spanish, Castilian - Old": "spa_old", + "Spanish": "spa", + "Sundanese": "sun", + "Swahili": "swa", + "Swedish": "swe", + "Syriac script": "Syriac", + "Syriac": "syr", + "Tajik": "tgk", + "Tamil script": "Tamil", + "Tamil": "tam", + "Tatar": "tat", + "Telugu script": "Telugu", + "Telugu": "tel", + "Thaana script": "Thaana", + "Thai script": "Thai", + "Thai": "tha", + "Tibetan script": "Tibetan", + "Tibetan Standard": "bod", + "Tigrinya": "tir", + "Tonga": "ton", + "Turkish": "tur", + "Ukrainian": "ukr", + "Urdu": "urd", + "Uyghur": "uig", + "Uzbek (Cyrillic)": "uzb_cyrl", + "Uzbek": "uzb", + "Vietnamese script": "Vietnamese", + "Vietnamese": "vie", + "Welsh": "cym", + "Yiddish": "yid", + "Yoruba": "yor" +} \ No newline at end of file