tesseract 4.1.1
Loading...
Searching...
No Matches
language_specific Namespace Reference

Functions

def set_lang_specific_parameters (ctx, lang)
 

Variables

 log = logging.getLogger(__name__)
 
tuple VALID_LANGUAGE_CODES
 
string UNUSABLE_LANGUAGE_CODES = ""
 
list FRAKTUR_FONTS
 
list LATIN_FONTS
 
list NEOLATIN_FONTS
 
list IRISH_UNCIAL_FONTS
 
list EARLY_LATIN_FONTS
 
list VIETNAMESE_FONTS
 
list DEVANAGARI_FONTS
 
list KANNADA_FONTS
 
list TELUGU_FONTS
 
list TAMIL_FONTS
 
list THAI_FONTS
 
list KOREAN_FONTS
 
list CHI_SIM_FONTS
 
list CHI_TRA_FONTS
 
list JPN_FONTS
 
list RUSSIAN_FONTS
 
list GREEK_FONTS
 
list ANCIENT_GREEK_FONTS
 
list ARABIC_FONTS
 
list HEBREW_FONTS
 
list BENGALI_FONTS
 
list KYRGYZ_FONTS
 
list PERSIAN_FONTS
 
list AMHARIC_FONTS
 
list ARMENIAN_FONTS
 
list BURMESE_FONTS
 
list JAVANESE_FONTS = ["Prada"]
 
list NORTH_AMERICAN_ABORIGINAL_FONTS
 
list GEORGIAN_FONTS
 
list OLD_GEORGIAN_FONTS
 
list KHMER_FONTS
 
list KURDISH_FONTS
 
list LAOTHIAN_FONTS
 
list GUJARATI_FONTS
 
list MALAYALAM_FONTS
 
list ORIYA_FONTS
 
list PUNJABI_FONTS
 
list SINHALA_FONTS
 
list SYRIAC_FONTS
 
list THAANA_FONTS = ["FreeSerif"]
 
list TIBETAN_FONTS
 
list VERTICAL_FONTS
 
 FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")
 

Function Documentation

◆ set_lang_specific_parameters()

def language_specific.set_lang_specific_parameters(ctx, lang)

Definition at line 894 of file language_specific.py.

def set_lang_specific_parameters(ctx, lang):
    """Compute the training parameters for ``lang`` and store them on ``ctx``.

    Every parameter starts from a default, is then adjusted by the branch
    matching ``lang``, and is finally copied onto ``ctx`` as a lower-case
    attribute (``fonts``, ``exposures``, ``text_corpus``, ``norm_mode``, ...).

    Args:
        ctx: Object carrying the training settings. ``ctx.fonts`` is read up
            front; when it is empty the language's default font list is used.
        lang: Language code selecting the per-language settings.

    Returns:
        The same ``ctx`` object, with the attributes listed in
        ``vars_to_transfer`` set.

    Raises:
        ValueError: If ``lang`` is not handled by any branch below.
    """
    # The default text location is now given directly from the language code.
    TEXT_CORPUS = f"{FLAGS_webtext_prefix}/{lang}.corpus.txt"
    FILTER_ARGUMENTS = []
    WORDLIST2DAWG_ARGUMENTS = ""
    # These dawg factors represent the fraction of the corpus not covered by the
    # dawg, and seem like reasonable defaults, but the optimal value is likely
    # to be highly corpus-dependent, as well as somewhat language-dependent.
    # Number dawg factor is the fraction of all numeric strings that are not
    # covered, which is why it is higher relative to the others.
    PUNC_DAWG_FACTOR = None
    NUMBER_DAWG_FACTOR = 0.125
    WORD_DAWG_FACTOR = 0.05
    BIGRAM_DAWG_FACTOR = 0.015
    TRAINING_DATA_ARGUMENTS = []
    FRAGMENTS_DISABLED = "y"
    RUN_SHAPE_CLUSTERING = False
    AMBIGS_FILTER_DENOMINATOR = "100000"
    LEADING = 32
    MEAN_COUNT = 40  # Default for latin script.
    # Language to mix with the language for maximum accuracy. Defaults to eng.
    # If no language is good, set to the base language.
    MIX_LANG = "eng"
    FONTS = ctx.fonts
    TEXT2IMAGE_EXTRA_ARGS = []
    EXPOSURES = []

    GENERATE_WORD_BIGRAMS = None
    WORD_DAWG_SIZE = None

    # Latin languages.
    if lang == "enm":
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "frm":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/fra.corpus.txt"
        # Make long-s substitutions for Middle French text
        FILTER_ARGUMENTS += ["--make_early_language_variant=fra"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "frk":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/deu.corpus.txt"
        if not FONTS:
            FONTS = FRAKTUR_FONTS
    elif lang == "ita_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/ita.corpus.txt"
        # Make long-s substitutions for Early Italian text
        FILTER_ARGUMENTS += ["--make_early_language_variant=ita"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "lat":
        if not EXPOSURES:
            EXPOSURES = "-3 -2 -1 0 1 2 3".split()
        if not FONTS:
            FONTS = NEOLATIN_FONTS
    elif lang == "spa_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/spa.corpus.txt"
        # Make long-s substitutions for Early Spanish text
        FILTER_ARGUMENTS += ["--make_early_language_variant=spa"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "srp_latn":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/srp.corpus.txt"
    elif lang == "vie":
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = VIETNAMESE_FONTS
    # Highly inflective languages get a bigger dawg size.
    # TODO(rays) Add more here!
    elif lang in ("hun", "pol"):
        WORD_DAWG_SIZE = 1_000_000

    # Latin with default treatment: accepted as valid, nothing to override.
    elif lang in (
        "afr",
        "aze",
        "bos",
        "cat",
        "ceb",
        "cym",
        "dan",
        "epo",
        "est",
        "eus",
        "fil",
        "fin",
        "gle",
        "glg",
        "hat",
        "hrv",
        "iast",
        "ind",
        "isl",
        "ita",
        "jav",
        "lav",
        "lit",
        "mlt",
        "msa",
        "nor",
        "por",
        "ron",
        "slk",
        "slv",
        "spa",
        "sqi",
        "swa",
        "swe",
        "tgl",
        "tur",
        "uzb",
        "zlm",
    ):
        pass
    # Latin with only a dawg factor or font override.
    elif lang == "ces":
        PUNC_DAWG_FACTOR = 0.004
    elif lang == "deu":
        WORD_DAWG_FACTOR = 0.125
    elif lang == "eng":
        WORD_DAWG_FACTOR = 0.03
    elif lang == "fra":
        WORD_DAWG_FACTOR = 0.08
    elif lang == "gle_uncial":
        if not FONTS:
            FONTS = IRISH_UNCIAL_FONTS
    elif lang == "nld":
        WORD_DAWG_FACTOR = 0.02

    # Special code for performing language-id that is trained on
    # EFIGS+Latin+Vietnamese text with regular + fraktur fonts.
    elif lang == "lat_lid":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/lat_lid.corpus.txt"
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        GENERATE_WORD_BIGRAMS = 0
        # Strip unrenderable words as not all fonts will render the extended
        # latin symbols found in Vietnamese text.
        WORD_DAWG_SIZE = 1_000_000
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS

    # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic.
    elif lang == "rus":
        if not FONTS:
            FONTS = RUSSIAN_FONTS
        MIX_LANG = "rus"
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_SIZE = 1_000_000
    elif lang in (
        "aze_cyrl",
        "bel",
        "bul",
        "kaz",
        "mkd",
        "srp",
        "tgk",
        "ukr",
        "uzb_cyrl",
    ):
        MIX_LANG = f"{lang}"
        if not FONTS:
            FONTS = RUSSIAN_FONTS

    # Special code for performing Cyrillic language-id that is trained on
    # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian
    # text with the list of Russian fonts.
    elif lang == "cyr_lid":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/cyr_lid.corpus.txt"
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        GENERATE_WORD_BIGRAMS = 0
        WORD_DAWG_SIZE = 1_000_000
        if not FONTS:
            FONTS = RUSSIAN_FONTS

    # South Asian scripts mostly have a lot of different graphemes, so trim
    # down the MEAN_COUNT so as not to get a huge amount of text.
    elif lang in ("asm", "ben"):
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = BENGALI_FONTS
    elif lang in ("bih", "hin", "mar", "nep", "san"):
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = DEVANAGARI_FONTS
    elif lang == "bod":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = TIBETAN_FONTS
    elif lang == "dzo":
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = TIBETAN_FONTS
    elif lang == "guj":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = GUJARATI_FONTS
    elif lang == "kan":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = KANNADA_FONTS
    elif lang == "mal":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = MALAYALAM_FONTS
    elif lang == "ori":
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = ORIYA_FONTS
    elif lang == "pan":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = PUNJABI_FONTS
    elif lang == "sin":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = SINHALA_FONTS
    elif lang == "tam":
        MEAN_COUNT = 30
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = TAMIL_FONTS
    elif lang == "tel":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = TELUGU_FONTS

    # SouthEast Asian scripts.
    elif lang == "jav_java":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = JAVANESE_FONTS
    elif lang == "khm":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = KHMER_FONTS
    elif lang == "lao":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = LAOTHIAN_FONTS
    elif lang == "mya":
        MEAN_COUNT = 12
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = BURMESE_FONTS
    elif lang == "tha":
        MEAN_COUNT = 30
        WORD_DAWG_FACTOR = 0.01
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        FILTER_ARGUMENTS += ["--segmenter_lang=tha"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        AMBIGS_FILTER_DENOMINATOR = "1000"
        LEADING = 48
        if not FONTS:
            FONTS = THAI_FONTS

    # CJK
    elif lang == "chi_sim":
        MEAN_COUNT = 15
        PUNC_DAWG_FACTOR = 0.015
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        FILTER_ARGUMENTS += ["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"]
        if not FONTS:
            FONTS = CHI_SIM_FONTS
    elif lang == "chi_tra":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        FILTER_ARGUMENTS += ["--charset_filter=chi_tr", "--segmenter_lang=chi_tra"]
        if not FONTS:
            FONTS = CHI_TRA_FONTS
    elif lang == "jpn":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        FILTER_ARGUMENTS += ["--charset_filter=jpn", "--segmenter_lang=jpn"]
        if not FONTS:
            FONTS = JPN_FONTS
    elif lang == "kor":
        MEAN_COUNT = 20
        WORD_DAWG_FACTOR = 0.015
        NUMBER_DAWG_FACTOR = 0.05
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--desired_bigrams="]
        GENERATE_WORD_BIGRAMS = 0
        FILTER_ARGUMENTS += ["--charset_filter=kor", "--segmenter_lang=kor"]
        if not FONTS:
            FONTS = KOREAN_FONTS

    # Middle-Eastern scripts.
    elif lang == "ara":
        if not FONTS:
            FONTS = ARABIC_FONTS
    elif lang == "div":
        if not FONTS:
            FONTS = THAANA_FONTS
    elif lang in ("fas", "pus", "snd", "uig", "urd"):
        if not FONTS:
            FONTS = PERSIAN_FONTS
    elif lang in ("heb", "yid"):
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_FACTOR = 0.08
        if not FONTS:
            FONTS = HEBREW_FONTS
    elif lang == "syr":
        if not FONTS:
            FONTS = SYRIAC_FONTS

    # Other scripts.
    elif lang in ("amh", "tir"):
        if not FONTS:
            FONTS = AMHARIC_FONTS
    elif lang == "chr":
        if not FONTS:
            FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS, "Noto Sans Cherokee"]
    elif lang == "ell":
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_FACTOR = 0.08
        if not FONTS:
            FONTS = GREEK_FONTS
    elif lang == "grc":
        if not EXPOSURES:
            EXPOSURES = "-3 -2 -1 0 1 2 3".split()
        if not FONTS:
            FONTS = ANCIENT_GREEK_FONTS
    elif lang == "hye":
        if not FONTS:
            FONTS = ARMENIAN_FONTS
    elif lang == "iku":
        if not FONTS:
            FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
    elif lang == "kat":
        if not FONTS:
            FONTS = GEORGIAN_FONTS
    elif lang == "kat_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/kat.corpus.txt"
        if not FONTS:
            FONTS = OLD_GEORGIAN_FONTS
    elif lang == "kir":
        if not FONTS:
            FONTS = KYRGYZ_FONTS
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=100"]
    elif lang == "kmr":
        if not FONTS:
            FONTS = LATIN_FONTS
    elif lang == "kur_ara":
        if not FONTS:
            FONTS = KURDISH_FONTS
    else:
        raise ValueError(f"Error: {lang} is not a valid language code")

    # A positive FLAGS_mean_count from the environment takes precedence;
    # otherwise pass along the per-language MEAN_COUNT chosen above. (The
    # previous `not MEAN_COUNT` test was inverted, so the language value was
    # never forwarded.)
    FLAGS_mean_count = int(os.environ.get("FLAGS_mean_count", -1))
    if FLAGS_mean_count > 0:
        TRAINING_DATA_ARGUMENTS += [f"--mean_count={FLAGS_mean_count}"]
    elif MEAN_COUNT:
        TRAINING_DATA_ARGUMENTS += [f"--mean_count={MEAN_COUNT}"]

    # Default to Latin fonts if none have been set
    if not FONTS:
        FONTS = LATIN_FONTS

    # Default to 0 exposure if it hasn't been set
    if not EXPOSURES:
        EXPOSURES = [0]

    # Set right-to-left and normalization mode.
    if lang in (
        "ara",
        "div",
        "fas",
        "pus",
        "snd",
        "syr",
        "uig",
        "urd",
        "kur_ara",
        "heb",
        "yid",
    ):
        LANG_IS_RTL = True
        NORM_MODE = 2
    elif lang in (
        "asm",
        "ben",
        "bih",
        "hin",
        "mar",
        "nep",
        "guj",
        "kan",
        "mal",
        "tam",
        "tel",
        "pan",
        "dzo",
        "sin",
        "san",
        "bod",
        "ori",
        "khm",
        "mya",
        "tha",
        "lao",
        # Was "jav " (trailing space), which could never equal the "jav"
        # code accepted by the language chain above.
        "jav",
        "jav_java",
    ):
        LANG_IS_RTL = False
        NORM_MODE = 2
    else:
        LANG_IS_RTL = False
        NORM_MODE = 1

    vars_to_transfer = {
        'ambigs_filter_denominator': AMBIGS_FILTER_DENOMINATOR,
        'bigram_dawg_factor': BIGRAM_DAWG_FACTOR,
        'exposures': EXPOSURES,
        'filter_arguments': FILTER_ARGUMENTS,
        'fonts': FONTS,
        'fragments_disabled': FRAGMENTS_DISABLED,
        'generate_word_bigrams': GENERATE_WORD_BIGRAMS,
        'lang_is_rtl': LANG_IS_RTL,
        'leading': LEADING,
        'mean_count': MEAN_COUNT,
        'mix_lang': MIX_LANG,
        'norm_mode': NORM_MODE,
        'number_dawg_factor': NUMBER_DAWG_FACTOR,
        'punc_dawg_factor': PUNC_DAWG_FACTOR,
        'run_shape_clustering': RUN_SHAPE_CLUSTERING,
        'text2image_extra_args': TEXT2IMAGE_EXTRA_ARGS,
        'text_corpus': TEXT_CORPUS,
        'training_data_arguments': TRAINING_DATA_ARGUMENTS,
        'word_dawg_factor': WORD_DAWG_FACTOR,
        'word_dawg_size': WORD_DAWG_SIZE,
        'wordlist2dawg_arguments': WORDLIST2DAWG_ARGUMENTS,
    }

    # Copy every computed value onto ctx, logging what changed. An attribute
    # that already exists with a different value is overwritten.
    for attr, value in vars_to_transfer.items():
        if hasattr(ctx, attr):
            if getattr(ctx, attr) != value:
                log.debug("%s = %s (was %s)", attr, value, getattr(ctx, attr))
                setattr(ctx, attr, value)
            else:
                log.debug("%s = %s (set on cmdline)", attr, value)
        else:
            log.debug("%s = %s", attr, value)
            setattr(ctx, attr, value)

    return ctx
1411
1412# =============================================================================
1413# END of Language specific info
1414# =============================================================================

Variable Documentation

◆ AMHARIC_FONTS

list language_specific.AMHARIC_FONTS
Initial value:
1= [
2 "Abyssinica SIL",
3 "Droid Sans Ethiopic Bold",
4 "Droid Sans Ethiopic",
5 "FreeSerif",
6 "Noto Sans Ethiopic Bold",
7 "Noto Sans Ethiopic",
8]

Definition at line 587 of file language_specific.py.

◆ ANCIENT_GREEK_FONTS

list language_specific.ANCIENT_GREEK_FONTS
Initial value:
1= [
2 "GFS Artemisia",
3 "GFS Artemisia Bold",
4 "GFS Artemisia Bold Italic",
5 "GFS Artemisia Italic",
6 "GFS Bodoni",
7 "GFS Bodoni Bold",
8 "GFS Bodoni Bold Italic",
9 "GFS Bodoni Italic",
10 "GFS Didot",
11 "GFS Didot Bold",
12 "GFS Didot Bold Italic",
13 "GFS Didot Italic",
14 "GFS DidotClassic",
15 "GFS Neohellenic",
16 "GFS Neohellenic Bold",
17 "GFS Neohellenic Bold Italic",
18 "GFS Neohellenic Italic",
19 "GFS Philostratos",
20 "GFS Porson",
21 "GFS Pyrsos",
22 "GFS Solomos",
23]

Definition at line 430 of file language_specific.py.

◆ ARABIC_FONTS

list language_specific.ARABIC_FONTS

Definition at line 454 of file language_specific.py.

◆ ARMENIAN_FONTS

list language_specific.ARMENIAN_FONTS
Initial value:
1= [
2 "Arial Unicode MS",
3 "Arial Unicode MS Bold",
4 "Ascender Uni",
5 "FreeMono",
6 "FreeMono Italic",
7 "FreeSans",
8 "FreeSans Bold",
9 "FreeSans Oblique",
10]

Definition at line 596 of file language_specific.py.

◆ BENGALI_FONTS

list language_specific.BENGALI_FONTS
Initial value:
1= [
2 "Bangla Medium",
3 "Lohit Bengali",
4 "Mukti Narrow",
5 "Mukti Narrow Bold",
6 "Jamrul Medium Semi-Expanded",
7 "Likhan Medium",
8 "Arial Unicode MS Bold",
9 "Ascender Uni",
10 "FreeSans",
11 "FreeSans Oblique",
12 "FreeSerif",
13 "FreeSerif Italic",
14 "Noto Sans Bengali Bold",
15 "Noto Sans Bengali",
16 "Ani",
17 "Lohit Assamese",
18 "Lohit Bengali",
19 "Mitra Mono",
20]

Definition at line 518 of file language_specific.py.

◆ BURMESE_FONTS

list language_specific.BURMESE_FONTS
Initial value:
1= [
2 "Myanmar Sans Pro",
3 "Noto Sans Myanmar Bold",
4 "Noto Sans Myanmar",
5 "Padauk Bold",
6 "Padauk",
7 "TharLon",
8]

Definition at line 607 of file language_specific.py.

◆ CHI_SIM_FONTS

list language_specific.CHI_SIM_FONTS
Initial value:
1= [
2 "AR PL UKai CN",
3 "AR PL UMing Patched Light",
4 "Arial Unicode MS",
5 "Arial Unicode MS Bold",
6 "WenQuanYi Zen Hei Medium",
7]

Definition at line 331 of file language_specific.py.

◆ CHI_TRA_FONTS

list language_specific.CHI_TRA_FONTS
Initial value:
1= [
2 "AR PL UKai TW",
3 "AR PL UMing TW MBE Light",
4 "AR PL UKai Patched",
5 "AR PL UMing Patched Light",
6 "Arial Unicode MS",
7 "Arial Unicode MS Bold",
8 "WenQuanYi Zen Hei Medium",
9]

Definition at line 339 of file language_specific.py.

◆ DEVANAGARI_FONTS

list language_specific.DEVANAGARI_FONTS
Initial value:
1= [
2 "FreeSans",
3 "Chandas",
4 "Kalimati",
5 "Uttara",
6 "Lucida Sans",
7 "gargi Medium",
8 "Lohit Devanagari",
9 "Arial Unicode MS Bold",
10 "Ascender Uni",
11 "Noto Sans Devanagari Bold",
12 "Noto Sans Devanagari",
13 "Samyak Devanagari Medium",
14 "Sarai",
15 "Saral LT Bold",
16 "Saral LT Light",
17 "Nakula",
18 "Sahadeva",
19 "Samanata",
20 "Santipur OT Medium",
21]

Definition at line 192 of file language_specific.py.

◆ EARLY_LATIN_FONTS

list language_specific.EARLY_LATIN_FONTS
Initial value:
1= [
2 *FRAKTUR_FONTS,
3 *LATIN_FONTS,
4 # The Wyld font family renders early modern ligatures encoded in the private
5 # unicode area.
6 "Wyld",
7 "Wyld Italic",
8 # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English.
9 "GentiumAlt",
10]

Definition at line 146 of file language_specific.py.

◆ FLAGS_webtext_prefix

language_specific.FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")

Definition at line 876 of file language_specific.py.

◆ FRAKTUR_FONTS

list language_specific.FRAKTUR_FONTS
Initial value:
1= [
2 "CaslonishFraxx Medium",
3 "Cloister Black, Light",
4 "Proclamate Light",
5 "UnifrakturMaguntia",
6 "Walbaum-Fraktur",
7]

Definition at line 43 of file language_specific.py.

◆ GEORGIAN_FONTS

list language_specific.GEORGIAN_FONTS

Definition at line 629 of file language_specific.py.

◆ GREEK_FONTS

list language_specific.GREEK_FONTS

Definition at line 397 of file language_specific.py.

◆ GUJARATI_FONTS

list language_specific.GUJARATI_FONTS
Initial value:
1= [
2 "Lohit Gujarati",
3 "Rekha Medium",
4 "Samyak Gujarati Medium",
5 "aakar Medium",
6 "padmaa Bold",
7 "padmaa Medium",
8 "Arial Unicode MS",
9 "Arial Unicode MS Bold",
10 "Ascender Uni",
11 "FreeSans",
12 "Noto Sans Gujarati Bold",
13 "Noto Sans Gujarati",
14 "Shruti",
15 "Shruti Bold",
16]

Definition at line 760 of file language_specific.py.

◆ HEBREW_FONTS

list language_specific.HEBREW_FONTS
Initial value:
1= [
2 "Arial Bold",
3 "Arial Bold Italic",
4 "Arial Italic",
5 "Arial",
6 "Courier New Bold",
7 "Courier New Bold Italic",
8 "Courier New Italic",
9 "Courier New",
10 "Ergo Hebrew Semi-Bold",
11 "Ergo Hebrew Semi-Bold Italic",
12 "Ergo Hebrew",
13 "Ergo Hebrew Italic",
14 "Really No 2 LT W2G Light",
15 "Really No 2 LT W2G Light Italic",
16 "Really No 2 LT W2G Medium",
17 "Really No 2 LT W2G Medium Italic",
18 "Really No 2 LT W2G Semi-Bold",
19 "Really No 2 LT W2G Semi-Bold Italic",
20 "Really No 2 LT W2G Ultra-Bold",
21 "Really No 2 LT W2G Ultra-Bold Italic",
22 "Times New Roman, Bold",
23 "Times New Roman, Bold Italic",
24 "Times New Roman, Italic",
25 "Times New Roman,",
26 "Lucida Sans",
27 "Tahoma",
28]

Definition at line 489 of file language_specific.py.

◆ IRISH_UNCIAL_FONTS

list language_specific.IRISH_UNCIAL_FONTS
Initial value:
1= [
2 "Bunchlo Arsa Dubh GC",
3 "Bunchlo Arsa GC",
4 "Bunchlo Arsa GC Bold",
5 "Bunchlo Dubh GC",
6 "Bunchlo GC",
7 "Bunchlo GC Bold",
8 "Bunchlo Nua GC Bold",
9 "Bunchló na Nod GC",
10 "Gadelica",
11 "Glanchlo Dubh GC",
12 "Glanchlo GC",
13 "Glanchlo GC Bold",
14 "Seanchló Dubh GC",
15 "Seanchló GC",
16 "Seanchló GC Bold",
17 "Seanchló na Nod GC",
18 "Seanchló Ársa Dubh GC",
19 "Seanchló Ársa GC",
20 "Seanchló Ársa GC Bold",
21 "Tromchlo Beag GC",
22 "Tromchlo Mor GC",
23 "Urchlo GC",
24 "Urchlo GC Bold",
25]

Definition at line 120 of file language_specific.py.

◆ JAVANESE_FONTS

list language_specific.JAVANESE_FONTS = ["Prada"]

Definition at line 616 of file language_specific.py.

◆ JPN_FONTS

list language_specific.JPN_FONTS
Initial value:
1= [
2 "TakaoExGothic",
3 "TakaoExMincho",
4 "TakaoGothic",
5 "TakaoMincho",
6 "TakaoPGothic",
7 "TakaoPMincho",
8 "VL Gothic",
9 "VL PGothic",
10 "Noto Sans Japanese Bold",
11 "Noto Sans Japanese Light",
12]

Definition at line 349 of file language_specific.py.

◆ KANNADA_FONTS

list language_specific.KANNADA_FONTS
Initial value:
1= [
2 "Kedage Bold",
3 "Kedage Italic",
4 "Kedage",
5 "Kedage Bold Italic",
6 "Mallige Bold",
7 "Mallige Italic",
8 "Mallige",
9 "Mallige Bold Italic",
10 "Arial Unicode MS",
11 "Arial Unicode MS Bold",
12 "Ascender Uni",
13 "cheluvi Medium",
14 "Noto Sans Kannada Bold",
15 "Noto Sans Kannada",
16 "Lohit Kannada",
17 "Tunga",
18 "Tunga Bold",
19]

Definition at line 214 of file language_specific.py.

◆ KHMER_FONTS

list language_specific.KHMER_FONTS
Initial value:
1= [
2 "Khmer OS",
3 "Khmer OS System",
4 "Khmer OS Battambang",
5 "Khmer OS Bokor",
6 "Khmer OS Content",
7 "Khmer OS Fasthand",
8 "Khmer OS Freehand",
9 "Khmer OS Metal Chrieng",
10 "Khmer OS Muol Light",
11 "Khmer OS Muol Pali",
12 "Khmer OS Muol",
13 "Khmer OS Siemreap",
14 "Noto Sans Bold",
15 "Noto Sans",
16 "Noto Serif Khmer Bold",
17 "Noto Serif Khmer Light",
18]

Definition at line 694 of file language_specific.py.

◆ KOREAN_FONTS

list language_specific.KOREAN_FONTS
Initial value:
1= [
2 "Arial Unicode MS",
3 "Arial Unicode MS Bold",
4 "Baekmuk Batang Patched",
5 "Baekmuk Batang",
6 "Baekmuk Dotum",
7 "Baekmuk Gulim",
8 "Baekmuk Headline",
9]

Definition at line 321 of file language_specific.py.

◆ KURDISH_FONTS

list language_specific.KURDISH_FONTS
Initial value:
1= [
2 "Amiri Bold Italic",
3 "Amiri Bold",
4 "Amiri Italic",
5 "Amiri",
6 "Arial Unicode MS",
7 "Arial Unicode MS Bold",
8 "Lateef",
9 "Lucida Bright",
10 "Lucida Sans Oblique",
11 "Lucida Sans Semi-Bold",
12 "Lucida Sans",
13 "Lucida Sans Typewriter Bold",
14 "Lucida Sans Typewriter Oblique",
15 "Lucida Sans Typewriter",
16 "Scheherazade",
17 "Tahoma",
18 "Times New Roman,",
19 "Times New Roman, Bold",
20 "Times New Roman, Bold Italic",
21 "Times New Roman, Italic",
22 "Unikurd Web",
23 "Yakout Linotype Bold",
24 "Yakout Linotype",
25]

Definition at line 713 of file language_specific.py.

◆ KYRGYZ_FONTS

list language_specific.KYRGYZ_FONTS
Initial value:
1= [
2 "Arial",
3 "Arial Bold",
4 "Arial Italic",
5 "Arial Bold Italic",
6 "Courier New",
7 "Courier New Bold",
8 "Courier New Italic",
9 "Courier New Bold Italic",
10 "Times New Roman,",
11 "Times New Roman, Bold",
12 "Times New Roman, Bold Italic",
13 "Times New Roman, Italic",
14 "DejaVu Serif",
15 "DejaVu Serif Oblique",
16 "DejaVu Serif Bold",
17 "DejaVu Serif Bold Oblique",
18 "Lucida Bright",
19 "FreeSerif Bold",
20 "FreeSerif Bold Italic",
21]

Definition at line 539 of file language_specific.py.

◆ LAOTHIAN_FONTS

list language_specific.LAOTHIAN_FONTS
Initial value:
1= [
2 "Phetsarath OT",
3 "Arial Unicode MS",
4 "Arial Unicode MS Bold",
5 "Ascender Uni",
6 "Dhyana Bold",
7 "Dhyana",
8 "Lao Muang Don",
9 "Lao Muang Khong",
10 "Lao Sans Pro",
11 "Noto Sans Lao Bold",
12 "Noto Sans Lao",
13 "Noto Sans Lao UI Bold",
14 "Noto Sans Lao UI",
15 "Noto Serif Lao Bold",
16 "Noto Serif Lao",
17 "Phetsarath Bold",
18 "Phetsarath",
19 "Souliyo Unicode",
20]

Definition at line 739 of file language_specific.py.

◆ LATIN_FONTS

list language_specific.LATIN_FONTS

Definition at line 52 of file language_specific.py.

◆ log

language_specific.log = logging.getLogger(__name__)

Definition at line 25 of file language_specific.py.

◆ MALAYALAM_FONTS

list language_specific.MALAYALAM_FONTS
Initial value:
1= [
2 "AnjaliOldLipi",
3 "Arial Unicode MS",
4 "Arial Unicode MS Bold",
5 "Ascender Uni",
6 "Dyuthi",
7 "FreeSerif",
8 "Kalyani",
9 "Kartika",
10 "Kartika Bold",
11 "Lohit Malayalam",
12 "Meera",
13 "Noto Sans Malayalam Bold",
14 "Noto Sans Malayalam",
15 "Rachana",
16 "Rachana_w01",
17 "RaghuMalayalam",
18 "suruma",
19]

Definition at line 777 of file language_specific.py.

◆ NEOLATIN_FONTS

list language_specific.NEOLATIN_FONTS

Definition at line 88 of file language_specific.py.

◆ NORTH_AMERICAN_ABORIGINAL_FONTS

list language_specific.NORTH_AMERICAN_ABORIGINAL_FONTS
Initial value:
1= [
2 "Aboriginal Sans",
3 "Aboriginal Sans Bold Italic",
4 "Aboriginal Sans Italic",
5 "Aboriginal Sans Bold",
6 "Aboriginal Serif Bold",
7 "Aboriginal Serif Bold Italic",
8 "Aboriginal Serif Italic",
9 "Aboriginal Serif",
10]

Definition at line 618 of file language_specific.py.

◆ OLD_GEORGIAN_FONTS

list language_specific.OLD_GEORGIAN_FONTS
Initial value:
1= [
2 "Arial Unicode MS Bold",
3 "Arial Unicode MS",
4 "BPG Algeti GPL\&GNU",
5 "BPG Courier S GPL\&GNU",
6 "BPG DejaVu Sans 2011 GNU-GPL",
7 "BPG Elite GPL\&GNU",
8 "BPG Excelsior GPL\&GNU",
9 "BPG Glaho GPL\&GNU",
10 "BPG Ingiri GPL\&GNU",
11 "BPG Mrgvlovani Caps GNU\&GPL",
12 "BPG Mrgvlovani GPL\&GNU",
13 "BPG Nateli Caps GPL\&GNU Light",
14 "BPG Nateli Condenced GPL\&GNU Light",
15 "BPG Nateli GPL\&GNU Light",
16 "BPG Nino Medium Cond GPL\&GNU",
17 "BPG Nino Medium GPL\&GNU Medium",
18 "BPG Sans GPL\&GNU",
19 "BPG Sans Medium GPL\&GNU",
20 "BPG Sans Modern GPL\&GNU",
21 "BPG Sans Regular GPL\&GNU",
22 "BPG Serif GPL\&GNU",
23 "BPG Serif Modern GPL\&GNU",
24 "FreeSans",
25 "FreeSerif",
26 "FreeSerif Bold",
27 "FreeSerif Bold Italic",
28 "FreeSerif Italic",
29]

Definition at line 664 of file language_specific.py.

◆ ORIYA_FONTS

list language_specific.ORIYA_FONTS
Initial value:
1= [
2 "Arial Unicode MS",
3 "Arial Unicode MS Bold",
4 "Ascender Uni",
5 "ori1Uni Medium",
6 "Samyak Oriya Medium",
7 "Lohit Oriya",
8]

Definition at line 797 of file language_specific.py.

◆ PERSIAN_FONTS

list language_specific.PERSIAN_FONTS
Initial value:
1= [
2 "Amiri Bold Italic",
3 "Amiri Bold",
4 "Amiri Italic",
5 "Amiri",
6 "Andale Sans Arabic Farsi",
7 "Arial Unicode MS",
8 "Arial Unicode MS Bold",
9 "Lateef",
10 "Lucida Bright",
11 "Lucida Sans Oblique",
12 "Lucida Sans Semi-Bold",
13 "Lucida Sans",
14 "Lucida Sans Typewriter Bold",
15 "Lucida Sans Typewriter Oblique",
16 "Lucida Sans Typewriter",
17 "Scheherazade",
18 "Tahoma",
19 "Times New Roman,",
20 "Times New Roman, Bold",
21 "Times New Roman, Bold Italic",
22 "Times New Roman, Italic",
23 "Yakout Linotype Bold",
24 "Yakout Linotype",
25]

Definition at line 561 of file language_specific.py.

◆ PUNJABI_FONTS

list language_specific.PUNJABI_FONTS
Initial value:
1= [
2 "Arial Unicode MS",
3 "Arial Unicode MS Bold",
4 "Ascender Uni",
5 "Saab",
6 "Lohit Punjabi",
7 "Noto Sans Gurmukhi",
8 "Noto Sans Gurmukhi Bold",
9 "FreeSans",
10 "FreeSans Bold",
11 "FreeSerif",
12]

Definition at line 806 of file language_specific.py.

◆ RUSSIAN_FONTS

list language_specific.RUSSIAN_FONTS

Definition at line 362 of file language_specific.py.

◆ SINHALA_FONTS

list language_specific.SINHALA_FONTS
Initial value:
1= [
2 "Noto Sans Sinhala Bold",
3 "Noto Sans Sinhala",
4 "OCRUnicode",
5 "Yagpo",
6 "LKLUG",
7 "FreeSerif",
8]

Definition at line 819 of file language_specific.py.

◆ SYRIAC_FONTS

list language_specific.SYRIAC_FONTS
Initial value:
1= [
2 "East Syriac Adiabene",
3 "East Syriac Ctesiphon",
4 "Estrangelo Antioch",
5 "Estrangelo Edessa",
6 "Estrangelo Midyat",
7 "Estrangelo Nisibin",
8 "Estrangelo Quenneshrin",
9 "Estrangelo Talada",
10 "Estrangelo TurAbdin",
11 "Serto Batnan Bold",
12 "Serto Batnan",
13 "Serto Jerusalem Bold",
14 "Serto Jerusalem Italic",
15 "Serto Jerusalem",
16 "Serto Kharput",
17 "Serto Malankara",
18 "Serto Mardin Bold",
19 "Serto Mardin",
20 "Serto Urhoy Bold",
21 "Serto Urhoy",
22 "FreeSans",
23]

Definition at line 828 of file language_specific.py.

◆ TAMIL_FONTS

list language_specific.TAMIL_FONTS
Initial value:
1= [
2 "TAMu_Kadambri",
3 "TAMu_Kalyani",
4 "TAMu_Maduram",
5 "TSCu_Paranar",
6 "TSCu_Times",
7 "TSCu_Paranar Bold",
8 "FreeSans",
9 "FreeSerif",
10 "Lohit Tamil",
11 "Arial Unicode MS Bold",
12 "Ascender Uni",
13 "Droid Sans Tamil Bold",
14 "Droid Sans Tamil",
15 "Karla Tamil Inclined Bold Italic",
16 "Karla Tamil Inclined Italic",
17 "Karla Tamil Upright Bold",
18 "Karla Tamil Upright",
19 "Noto Sans Tamil Bold",
20 "Noto Sans Tamil",
21 "Noto Sans Tamil UI Bold",
22 "Noto Sans Tamil UI",
23 "TSCu_Comic Normal",
24 "Lohit Tamil Classical",
25]

Definition at line 262 of file language_specific.py.

◆ TELUGU_FONTS

list language_specific.TELUGU_FONTS
Initial value:
1= [
2 "Pothana2000",
3 "Vemana2000",
4 "Lohit Telugu",
5 "Arial Unicode MS Bold",
6 "Ascender Uni",
7 "Dhurjati",
8 "Gautami Bold",
9 "Gidugu",
10 "Gurajada",
11 "Lakki Reddy",
12 "Mallanna",
13 "Mandali",
14 "NATS",
15 "NTR",
16 "Noto Sans Telugu Bold",
17 "Noto Sans Telugu",
18 "Peddana",
19 "Ponnala",
20 "Ramabhadra",
21 "Ravi Prakash",
22 "Sree Krushnadevaraya",
23 "Suranna",
24 "Suravaram",
25 "Tenali Ramakrishna",
26 "Gautami",
27]

Definition at line 234 of file language_specific.py.

◆ THAANA_FONTS

list language_specific.THAANA_FONTS = ["FreeSerif"]

Definition at line 852 of file language_specific.py.

◆ THAI_FONTS

list language_specific.THAI_FONTS

Definition at line 288 of file language_specific.py.

◆ TIBETAN_FONTS

list language_specific.TIBETAN_FONTS
Initial value:
1= [
2 "Arial Unicode MS",
3 "Arial Unicode MS Bold",
4 "Ascender Uni",
5 "DDC Uchen",
6 "Jomolhari",
7 "Kailasa",
8 "Kokonor",
9 "Tibetan Machine Uni",
10 "TibetanTsugRing",
11 "Yagpo",
12]

Definition at line 854 of file language_specific.py.

◆ UNUSABLE_LANGUAGE_CODES

string language_specific.UNUSABLE_LANGUAGE_CODES = ""

Definition at line 41 of file language_specific.py.

◆ VALID_LANGUAGE_CODES

tuple language_specific.VALID_LANGUAGE_CODES
Initial value:
1= (
2 "afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat "
3 "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo "
4 "ell eng enm epo est eus fas fil fin fra frk frm gle glg "
5 "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old "
6 "jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat "
7 "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori "
8 "pan pol por pus ron rus san sin slk slv snd spa spa_old "
9 "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur "
10 "uig ukr urd uzb uzb_cyrl vie yid gle_uncial "
11)

Definition at line 28 of file language_specific.py.

◆ VERTICAL_FONTS

list language_specific.VERTICAL_FONTS
Initial value:
1= [
2 "TakaoExGothic",
3 "TakaoExMincho",
4 "AR PL UKai Patched",
5 "AR PL UMing Patched Light",
6 "Baekmuk Batang Patched",
7]

Definition at line 868 of file language_specific.py.

◆ VIETNAMESE_FONTS

list language_specific.VIETNAMESE_FONTS

Definition at line 157 of file language_specific.py.