def set_lang_specific_parameters(ctx, lang):
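    """Set language-specific training parameters on the training context.

    Chooses the text corpus, fonts, dawg factors, exposures and
    text2image/training-data arguments for the given language code,
    then copies the resulting values onto ctx and returns it.
    Raises ValueError for an unknown language code.
    """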

    TEXT_CORPUS = f"{FLAGS_webtext_prefix}/{lang}.corpus.txt"
    FILTER_ARGUMENTS = []
    WORDLIST2DAWG_ARGUMENTS = ""

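    # Default dawg factors and training parameters; the language branches
    # below override them where needed.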
    PUNC_DAWG_FACTOR = None
    NUMBER_DAWG_FACTOR = 0.125
    WORD_DAWG_FACTOR = 0.05
    BIGRAM_DAWG_FACTOR = 0.015
    TRAINING_DATA_ARGUMENTS = []
    FRAGMENTS_DISABLED = "y"
    RUN_SHAPE_CLUSTERING = False
    AMBIGS_FILTER_DENOMINATOR = "100000"
    LEADING = 32
    MEAN_COUNT = 40

    MIX_LANG = "eng"
    FONTS = ctx.fonts
    TEXT2IMAGE_EXTRA_ARGS = []
    EXPOSURES = []

    GENERATE_WORD_BIGRAMS = None
    WORD_DAWG_SIZE = None

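    # Latin-script languages and historical variants.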
    if lang == "enm":
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "frm":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/fra.corpus.txt"
        FILTER_ARGUMENTS += ["--make_early_language_variant=fra"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "frk":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/deu.corpus.txt"
        if not FONTS:
            FONTS = FRAKTUR_FONTS
    elif lang == "ita_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/ita.corpus.txt"
        FILTER_ARGUMENTS += ["--make_early_language_variant=ita"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "lat":
        if not EXPOSURES:
            EXPOSURES = "-3 -2 -1 0 1 2 3".split()
        if not FONTS:
            FONTS = NEOLATIN_FONTS
    elif lang == "spa_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/spa.corpus.txt"
        FILTER_ARGUMENTS += ["--make_early_language_variant=spa"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "srp_latn":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/srp.corpus.txt"
    elif lang == "vie":
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = VIETNAMESE_FONTS

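    # Highly inflective languages get a bigger word dawg.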
    elif lang == "hun":
        WORD_DAWG_SIZE = 1_000_000
    elif lang == "pol":
        WORD_DAWG_SIZE = 1_000_000

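    # Latin-script languages that mostly use the defaults.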
    elif lang == "afr":
        pass
    elif lang == "aze":
        pass
    elif lang == "bos":
        pass
    elif lang == "cat":
        pass
    elif lang == "ceb":
        pass
    elif lang == "ces":
        PUNC_DAWG_FACTOR = 0.004
    elif lang == "cym":
        pass
    elif lang == "dan":
        pass
    elif lang == "deu":
        WORD_DAWG_FACTOR = 0.125
    elif lang == "eng":
        WORD_DAWG_FACTOR = 0.03
    elif lang == "epo":
        pass
    elif lang == "est":
        pass
    elif lang == "eus":
        pass
    elif lang == "fil":
        pass
    elif lang == "fin":
        pass
    elif lang == "fra":
        WORD_DAWG_FACTOR = 0.08
    elif lang == "gle":
        pass
    elif lang == "gle_uncial":
        if not FONTS:
            FONTS = IRISH_UNCIAL_FONTS
    elif lang == "glg":
        pass
    elif lang == "hat":
        pass
    elif lang == "hrv":
        pass
    elif lang == "iast":
        pass
    elif lang == "ind":
        pass
    elif lang == "isl":
        pass
    elif lang == "ita":
        pass
    elif lang == "jav":
        pass
    elif lang == "lav":
        pass
    elif lang == "lit":
        pass
    elif lang == "mlt":
        pass
    elif lang == "msa":
        pass
    elif lang == "nld":
        WORD_DAWG_FACTOR = 0.02
    elif lang == "nor":
        pass
    elif lang == "por":
        pass
    elif lang == "ron":
        pass
    elif lang == "slk":
        pass
    elif lang == "slv":
        pass
    elif lang == "spa":
        pass
    elif lang == "sqi":
        pass
    elif lang == "swa":
        pass
    elif lang == "swe":
        pass
    elif lang == "tgl":
        pass
    elif lang == "tur":
        pass
    elif lang == "uzb":
        pass
    elif lang == "zlm":
        pass

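    # Latin-script language identification, trained on a dedicated mixed corpus.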
    elif lang == "lat_lid":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/lat_lid.corpus.txt"
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        GENERATE_WORD_BIGRAMS = 0
        WORD_DAWG_SIZE = 1_000_000
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS

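    # Cyrillic-script languages.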
    elif lang == "rus":
        if not FONTS:
            FONTS = RUSSIAN_FONTS
        MIX_LANG = "rus"
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_SIZE = 1_000_000
    elif lang in (
        "aze_cyrl",
        "bel",
        "bul",
        "kaz",
        "mkd",
        "srp",
        "tgk",
        "ukr",
        "uzb_cyrl",
    ):
        MIX_LANG = f"{lang}"
        if not FONTS:
            FONTS = RUSSIAN_FONTS

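    # Cyrillic-script language identification, trained on a dedicated mixed corpus.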
    elif lang == "cyr_lid":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/cyr_lid.corpus.txt"
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        GENERATE_WORD_BIGRAMS = 0
        WORD_DAWG_SIZE = 1_000_000
        if not FONTS:
            FONTS = RUSSIAN_FONTS

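    # South Asian scripts: many distinct graphemes, so a smaller MEAN_COUNT is used.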
    elif lang in ("asm", "ben"):
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = BENGALI_FONTS
    elif lang in ("bih", "hin", "mar", "nep", "san"):
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = DEVANAGARI_FONTS
    elif lang == "bod":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = TIBETAN_FONTS
    elif lang == "dzo":
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = TIBETAN_FONTS
    elif lang == "guj":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = GUJARATI_FONTS
    elif lang == "kan":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = KANNADA_FONTS
    elif lang == "mal":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = MALAYALAM_FONTS
    elif lang == "ori":
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = ORIYA_FONTS
    elif lang == "pan":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = PUNJABI_FONTS
    elif lang == "sin":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = SINHALA_FONTS
    elif lang == "tam":
        MEAN_COUNT = 30
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = TAMIL_FONTS
    elif lang == "tel":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = TELUGU_FONTS

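    # Southeast Asian scripts.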
    elif lang == "jav_java":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = JAVANESE_FONTS
    elif lang == "khm":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = KHMER_FONTS
    elif lang == "lao":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = LAOTHIAN_FONTS
    elif lang == "mya":
        MEAN_COUNT = 12
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = BURMESE_FONTS
    elif lang == "tha":
        MEAN_COUNT = 30
        WORD_DAWG_FACTOR = 0.01
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        FILTER_ARGUMENTS += ["--segmenter_lang=tha"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        AMBIGS_FILTER_DENOMINATOR = "1000"
        LEADING = 48
        if not FONTS:
            FONTS = THAI_FONTS

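    # CJK scripts.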
    elif lang == "chi_sim":
        MEAN_COUNT = 15
        PUNC_DAWG_FACTOR = 0.015
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        FILTER_ARGUMENTS += ["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"]
        if not FONTS:
            FONTS = CHI_SIM_FONTS
    elif lang == "chi_tra":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        FILTER_ARGUMENTS += ["--charset_filter=chi_tr", "--segmenter_lang=chi_tra"]
        if not FONTS:
            FONTS = CHI_TRA_FONTS
    elif lang == "jpn":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        FILTER_ARGUMENTS += ["--charset_filter=jpn", "--segmenter_lang=jpn"]
        if not FONTS:
            FONTS = JPN_FONTS
    elif lang == "kor":
        MEAN_COUNT = 20
        WORD_DAWG_FACTOR = 0.015
        NUMBER_DAWG_FACTOR = 0.05
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--desired_bigrams="]
        GENERATE_WORD_BIGRAMS = 0
        FILTER_ARGUMENTS += ["--charset_filter=kor", "--segmenter_lang=kor"]
        if not FONTS:
            FONTS = KOREAN_FONTS

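    # Middle Eastern scripts.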
    elif lang == "ara":
        if not FONTS:
            FONTS = ARABIC_FONTS
    elif lang == "div":
        if not FONTS:
            FONTS = THAANA_FONTS
    elif lang in ("fas", "pus", "snd", "uig", "urd"):
        if not FONTS:
            FONTS = PERSIAN_FONTS
    elif lang in ("heb", "yid"):
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_FACTOR = 0.08
        if not FONTS:
            FONTS = HEBREW_FONTS
    elif lang == "syr":
        if not FONTS:
            FONTS = SYRIAC_FONTS

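    # Other scripts.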
    elif lang in ("amh", "tir"):
        if not FONTS:
            FONTS = AMHARIC_FONTS
    elif lang == "chr":
        if not FONTS:
            FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS, "Noto Sans Cherokee"]
    elif lang == "ell":
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_FACTOR = 0.08
        if not FONTS:
            FONTS = GREEK_FONTS
    elif lang == "grc":
        if not EXPOSURES:
            EXPOSURES = "-3 -2 -1 0 1 2 3".split()
        if not FONTS:
            FONTS = ANCIENT_GREEK_FONTS
    elif lang == "hye":
        if not FONTS:
            FONTS = ARMENIAN_FONTS
    elif lang == "iku":
        if not FONTS:
            FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
    elif lang == "kat":
        if not FONTS:
            FONTS = GEORGIAN_FONTS
    elif lang == "kat_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/kat.corpus.txt"
        if not FONTS:
            FONTS = OLD_GEORGIAN_FONTS
    elif lang == "kir":
        if not FONTS:
            FONTS = KYRGYZ_FONTS
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=100"]
    elif lang == "kmr":
        if not FONTS:
            FONTS = LATIN_FONTS
    elif lang == "kur_ara":
        if not FONTS:
            FONTS = KURDISH_FONTS
    else:
        raise ValueError(f"Error: {lang} is not a valid language code")

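    # A FLAGS_mean_count environment variable overrides the language default.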
    FLAGS_mean_count = int(os.environ.get("FLAGS_mean_count", -1))
    if FLAGS_mean_count > 0:
        TRAINING_DATA_ARGUMENTS += [f"--mean_count={FLAGS_mean_count}"]
    elif MEAN_COUNT:
        TRAINING_DATA_ARGUMENTS += [f"--mean_count={MEAN_COUNT}"]

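    # Fall back to Latin fonts and a single exposure of 0 when nothing else was set.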
    if not FONTS:
        FONTS = LATIN_FONTS

    if not EXPOSURES:
        EXPOSURES = [0]

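    # The right-to-left flag and normalization mode depend on the script.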
    if lang in (
        "ara",
        "div",
        "fas",
        "pus",
        "snd",
        "syr",
        "uig",
        "urd",
        "kur_ara",
        "heb",
        "yid",
    ):
        LANG_IS_RTL = True
        NORM_MODE = 2
    elif lang in (
        "asm",
        "ben",
        "bih",
        "hin",
        "mar",
        "nep",
        "guj",
        "kan",
        "mal",
        "tam",
        "tel",
        "pan",
        "dzo",
        "sin",
        "san",
        "bod",
        "ori",
        "khm",
        "mya",
        "tha",
        "lao",
        "jav",
        "jav_java",
    ):
        LANG_IS_RTL = False
        NORM_MODE = 2
    else:
        LANG_IS_RTL = False
        NORM_MODE = 1

    vars_to_transfer = {
        'ambigs_filter_denominator': AMBIGS_FILTER_DENOMINATOR,
        'bigram_dawg_factor': BIGRAM_DAWG_FACTOR,
        'exposures': EXPOSURES,
        'filter_arguments': FILTER_ARGUMENTS,
        'fonts': FONTS,
        'fragments_disabled': FRAGMENTS_DISABLED,
        'generate_word_bigrams': GENERATE_WORD_BIGRAMS,
        'lang_is_rtl': LANG_IS_RTL,
        'leading': LEADING,
        'mean_count': MEAN_COUNT,
        'mix_lang': MIX_LANG,
        'norm_mode': NORM_MODE,
        'number_dawg_factor': NUMBER_DAWG_FACTOR,
        'punc_dawg_factor': PUNC_DAWG_FACTOR,
        'run_shape_clustering': RUN_SHAPE_CLUSTERING,
        'text2image_extra_args': TEXT2IMAGE_EXTRA_ARGS,
        'text_corpus': TEXT_CORPUS,
        'training_data_arguments': TRAINING_DATA_ARGUMENTS,
        'word_dawg_factor': WORD_DAWG_FACTOR,
        'word_dawg_size': WORD_DAWG_SIZE,
        'wordlist2dawg_arguments': WORDLIST2DAWG_ARGUMENTS,
    }

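    # Copy the computed values onto the training context, logging how each value was resolved.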
    for attr, value in vars_to_transfer.items():
        if hasattr(ctx, attr):
            if getattr(ctx, attr) != value:
                log.debug(f"{attr} = {value} (was {getattr(ctx, attr)})")
                setattr(ctx, attr, value)
            else:
                log.debug(f"{attr} = {value} (set on cmdline)")
        else:
            log.debug(f"{attr} = {value}")
            setattr(ctx, attr, value)

    return ctx