tesseract 4.1.1
Loading...
Searching...
No Matches
language_specific.py
Go to the documentation of this file.
1#!/usr/bin/env python3
2# (C) Copyright 2014, Google Inc.
3# (C) Copyright 2018, James R Barlow
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7# http://www.apache.org/licenses/LICENSE-2.0
8# Unless required by applicable law or agreed to in writing, software
9# distributed under the License is distributed on an "AS IS" BASIS,
10# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11# See the License for the specific language governing permissions and
12# limitations under the License.
13#
14# Set some language specific variables. Works in conjunction with
15# tesstrain.sh
16#
17
18# =============================================================================
19# Language specific info
20# =============================================================================
21
22import logging
23import os
24
25log = logging.getLogger(__name__)
26
# All valid language codes, stored as ONE space-separated string (note the
# trailing space left by the implicit string concatenation). Call .split()
# before doing membership tests: `in` on the raw string is a substring match.
VALID_LANGUAGE_CODES = (
    "afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat "
    "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo "
    "ell eng enm epo est eus fas fil fin fra frk frm gle glg "
    "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old "
    "jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat "
    "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori "
    "pan pol por pus ron rus san sin slk slv snd spa spa_old "
    "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur "
    "uig ukr urd uzb uzb_cyrl vie yid gle_uncial "
)
39
# Codes for which we have webtext but no fonts (currently none, hence the
# empty string).
UNUSABLE_LANGUAGE_CODES = ""
42
# Fraktur fonts: the fallback font list for `frk`, and also folded into
# EARLY_LATIN_FONTS.
FRAKTUR_FONTS = [
    "CaslonishFraxx Medium",
    "Cloister Black, Light",
    "Proclamate Light",
    "UnifrakturMaguntia",
    "Walbaum-Fraktur",
]
50
# List of Latin-script fonts to train on. Used as the fallback for `kmr` and
# folded into EARLY_LATIN_FONTS. The trailing comma in "Times New Roman," is
# part of the font name string and is intentional.
LATIN_FONTS = [
    "Arial Bold",
    "Arial Bold Italic",
    "Arial Italic",
    "Arial",
    "Courier New Bold",
    "Courier New Bold Italic",
    "Courier New Italic",
    "Courier New",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Times New Roman,",
    "Georgia Bold",
    "Georgia Italic",
    "Georgia",
    "Georgia Bold Italic",
    "Trebuchet MS Bold",
    "Trebuchet MS Bold Italic",
    "Trebuchet MS Italic",
    "Trebuchet MS",
    "Verdana Bold",
    "Verdana Italic",
    "Verdana",
    "Verdana Bold Italic",
    "URW Bookman L Bold",
    "URW Bookman L Italic",
    "URW Bookman L Bold Italic",
    "Century Schoolbook L Bold",
    "Century Schoolbook L Italic",
    "Century Schoolbook L Bold Italic",
    "Century Schoolbook L Medium",
    "DejaVu Sans Ultra-Light",
]
86
# List of fonts for printed/neo-Latin (the 'lat' language code, which is
# different from the Latin script). Fallback font list for `lat`.
NEOLATIN_FONTS = [
    "GFS Bodoni",
    "GFS Bodoni Bold",
    "GFS Bodoni Italic",
    "GFS Bodoni Bold Italic",
    "GFS Didot",
    "GFS Didot Bold",
    "GFS Didot Italic",
    "GFS Didot Bold Italic",
    "Cardo",
    "Cardo Bold",
    "Cardo Italic",
    "Wyld",
    "Wyld Italic",
    "EB Garamond",
    "EB Garamond Italic",
    "Junicode",
    "Junicode Bold",
    "Junicode Italic",
    "Junicode Bold Italic",
    "IM FELL DW Pica PRO",
    "IM FELL English PRO",
    "IM FELL Double Pica PRO",
    "IM FELL French Canon PRO",
    "IM FELL Great Primer PRO",
    "IM FELL DW Pica PRO Italic",
    "IM FELL English PRO Italic",
    "IM FELL Double Pica PRO Italic",
    "IM FELL French Canon PRO Italic",
    "IM FELL Great Primer PRO Italic",
]
119
# Fallback font list for `gle_uncial`. Several names contain accented
# characters (ó, Á) that must be preserved exactly.
IRISH_UNCIAL_FONTS = [
    "Bunchlo Arsa Dubh GC",
    "Bunchlo Arsa GC",
    "Bunchlo Arsa GC Bold",
    "Bunchlo Dubh GC",
    "Bunchlo GC",
    "Bunchlo GC Bold",
    "Bunchlo Nua GC Bold",
    "Bunchló na Nod GC",
    "Gadelica",
    "Glanchlo Dubh GC",
    "Glanchlo GC",
    "Glanchlo GC Bold",
    "Seanchló Dubh GC",
    "Seanchló GC",
    "Seanchló GC Bold",
    "Seanchló na Nod GC",
    "Seanchló Ársa Dubh GC",
    "Seanchló Ársa GC",
    "Seanchló Ársa GC Bold",
    "Tromchlo Beag GC",
    "Tromchlo Mor GC",
    "Urchlo GC",
    "Urchlo GC Bold",
]
145
# Fallback font list for the early-language variants `enm`, `frm`, `ita_old`,
# `spa_old` and for `lat_lid`: every Fraktur font, then every Latin font, then
# a few extras. Built once at import time from FRAKTUR_FONTS and LATIN_FONTS.
EARLY_LATIN_FONTS = [
    *FRAKTUR_FONTS,
    *LATIN_FONTS,
    # The Wyld font family renders early modern ligatures encoded in the
    # private unicode area.
    "Wyld",
    "Wyld Italic",
    # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English.
    "GentiumAlt",
]
156
# Fallback font list for `vie`.
VIETNAMESE_FONTS = [
    "Arial Unicode MS Bold",
    "Arial Bold Italic",
    "Arial Italic",
    "Arial Unicode MS",
    "FreeMono Bold",
    "Courier New Bold Italic",
    "FreeMono Italic",
    "FreeMono",
    "GentiumAlt Italic",
    "GentiumAlt",
    "Palatino Linotype Bold",
    "Palatino Linotype Bold Italic",
    "Palatino Linotype Italic",
    "Palatino Linotype",
    "Really No 2 LT W2G Light",
    "Really No 2 LT W2G Light Italic",
    "Really No 2 LT W2G Medium",
    "Really No 2 LT W2G Medium Italic",
    "Really No 2 LT W2G Semi-Bold",
    "Really No 2 LT W2G Semi-Bold Italic",
    "Really No 2 LT W2G Ultra-Bold",
    "Really No 2 LT W2G Ultra-Bold Italic",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Times New Roman,",
    "Verdana Bold",
    "Verdana Italic",
    "Verdana",
    "Verdana Bold Italic",
    "VL Gothic",
    "VL PGothic",
]
191
# Fallback font list for the Devanagari-script codes `bih`, `hin`, `mar`,
# `nep` and `san`.
DEVANAGARI_FONTS = [
    "FreeSans",
    "Chandas",
    "Kalimati",
    "Uttara",
    "Lucida Sans",
    "gargi Medium",
    "Lohit Devanagari",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Noto Sans Devanagari Bold",
    "Noto Sans Devanagari",
    "Samyak Devanagari Medium",
    "Sarai",
    "Saral LT Bold",
    "Saral LT Light",
    "Nakula",
    "Sahadeva",
    "Samanata",
    "Santipur OT Medium",
]
213
# Fallback font list for `kan`.
KANNADA_FONTS = [
    "Kedage Bold",
    "Kedage Italic",
    "Kedage",
    "Kedage Bold Italic",
    "Mallige Bold",
    "Mallige Italic",
    "Mallige",
    "Mallige Bold Italic",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "cheluvi Medium",
    "Noto Sans Kannada Bold",
    "Noto Sans Kannada",
    "Lohit Kannada",
    "Tunga",
    "Tunga Bold",
]
233
# Fallback font list for `tel`.
TELUGU_FONTS = [
    "Pothana2000",
    "Vemana2000",
    "Lohit Telugu",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Dhurjati",
    "Gautami Bold",
    "Gidugu",
    "Gurajada",
    "Lakki Reddy",
    "Mallanna",
    "Mandali",
    "NATS",
    "NTR",
    "Noto Sans Telugu Bold",
    "Noto Sans Telugu",
    "Peddana",
    "Ponnala",
    "Ramabhadra",
    "Ravi Prakash",
    "Sree Krushnadevaraya",
    "Suranna",
    "Suravaram",
    "Tenali Ramakrishna",
    "Gautami",
]
261
# Fallback font list for `tam`.
TAMIL_FONTS = [
    "TAMu_Kadambri",
    "TAMu_Kalyani",
    "TAMu_Maduram",
    "TSCu_Paranar",
    "TSCu_Times",
    "TSCu_Paranar Bold",
    "FreeSans",
    "FreeSerif",
    "Lohit Tamil",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Droid Sans Tamil Bold",
    "Droid Sans Tamil",
    "Karla Tamil Inclined Bold Italic",
    "Karla Tamil Inclined Italic",
    "Karla Tamil Upright Bold",
    "Karla Tamil Upright",
    "Noto Sans Tamil Bold",
    "Noto Sans Tamil",
    "Noto Sans Tamil UI Bold",
    "Noto Sans Tamil UI",
    "TSCu_Comic Normal",
    "Lohit Tamil Classical",
]
287
# Fallback font list for `tha`.
THAI_FONTS = [
    "FreeSerif",
    "FreeSerif Italic",
    "Garuda",
    "Norasi",
    "Lucida Sans Typewriter",
    "Lucida Sans",
    "Garuda Oblique",
    "Norasi Oblique",
    "Norasi Italic",
    "Garuda Bold",
    "Norasi Bold",
    "Lucida Sans Typewriter Bold",
    "Lucida Sans Semi-Bold",
    "Garuda Bold Oblique",
    "Norasi Bold Italic",
    "Norasi Bold Oblique",
    "AnuParp LT Thai",
    "Arial Unicode MS Bold",
    "Arial Unicode MS",
    "Ascender Uni",
    "Loma",
    "Noto Serif Thai Bold",
    "Noto Serif Thai",
    "Purisa Light",
    "Sirichana LT Bold",
    "Sirichana LT",
    "Sukothai LT Bold",
    "Sukothai LT",
    "UtSaHaGumm LT Thai",
    "Tahoma",
]
320
# Fallback font list for `kor`.
KOREAN_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Baekmuk Batang Patched",
    "Baekmuk Batang",
    "Baekmuk Dotum",
    "Baekmuk Gulim",
    "Baekmuk Headline",
]
330
# Fallback font list for `chi_sim`.
CHI_SIM_FONTS = [
    "AR PL UKai CN",
    "AR PL UMing Patched Light",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "WenQuanYi Zen Hei Medium",
]
338
# Fallback font list for `chi_tra`.
CHI_TRA_FONTS = [
    "AR PL UKai TW",
    "AR PL UMing TW MBE Light",
    "AR PL UKai Patched",
    "AR PL UMing Patched Light",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "WenQuanYi Zen Hei Medium",
]
348
# Fallback font list for `jpn`.
JPN_FONTS = [
    "TakaoExGothic",
    "TakaoExMincho",
    "TakaoGothic",
    "TakaoMincho",
    "TakaoPGothic",
    "TakaoPMincho",
    "VL Gothic",
    "VL PGothic",
    "Noto Sans Japanese Bold",
    "Noto Sans Japanese Light",
]
361
# Fallback font list for `rus` and for the other Cyrillic-script codes
# (`aze_cyrl`, `bel`, `bul`, `kaz`, `mkd`, `srp`, `tgk`, `ukr`, `uzb_cyrl`,
# `cyr_lid`).
RUSSIAN_FONTS = [
    "Arial Bold",
    "Arial Bold Italic",
    "Arial Italic",
    "Arial",
    "Courier New Bold",
    "Courier New Bold Italic",
    "Courier New Italic",
    "Courier New",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Times New Roman,",
    "Georgia Bold",
    "Georgia Italic",
    "Georgia",
    "Georgia Bold Italic",
    "Trebuchet MS Bold",
    "Trebuchet MS Bold Italic",
    "Trebuchet MS Italic",
    "Trebuchet MS",
    "Verdana Bold",
    "Verdana Italic",
    "Verdana",
    "Verdana Bold Italic",
    "DejaVu Serif",
    "DejaVu Serif Oblique",
    "DejaVu Serif Bold",
    "DejaVu Serif Bold Oblique",
    "Lucida Bright",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
    "DejaVu Sans Ultra-Light",
]
396
# Fallback font list for `ell`.
GREEK_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "DejaVu Sans Mono",
    "DejaVu Sans Mono Oblique",
    "DejaVu Sans Mono Bold",
    "DejaVu Sans Mono Bold Oblique",
    "DejaVu Serif",
    "DejaVu Serif Semi-Condensed",
    "DejaVu Serif Oblique",
    "DejaVu Serif Bold",
    "DejaVu Serif Bold Oblique",
    "DejaVu Serif Bold Semi-Condensed",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
    "FreeSerif Italic",
    "FreeSerif",
    "GentiumAlt",
    "GentiumAlt Italic",
    "Linux Biolinum O Bold",
    "Linux Biolinum O",
    "Linux Libertine O Bold",
    "Linux Libertine O",
    "Linux Libertine O Bold Italic",
    "Linux Libertine O Italic",
    "Palatino Linotype Bold",
    "Palatino Linotype Bold Italic",
    "Palatino Linotype Italic",
    "Palatino Linotype",
    "UmePlus P Gothic",
    "VL PGothic",
]
429
# Fallback font list for `grc`.
ANCIENT_GREEK_FONTS = [
    "GFS Artemisia",
    "GFS Artemisia Bold",
    "GFS Artemisia Bold Italic",
    "GFS Artemisia Italic",
    "GFS Bodoni",
    "GFS Bodoni Bold",
    "GFS Bodoni Bold Italic",
    "GFS Bodoni Italic",
    "GFS Didot",
    "GFS Didot Bold",
    "GFS Didot Bold Italic",
    "GFS Didot Italic",
    "GFS DidotClassic",
    "GFS Neohellenic",
    "GFS Neohellenic Bold",
    "GFS Neohellenic Bold Italic",
    "GFS Neohellenic Italic",
    "GFS Philostratos",
    "GFS Porson",
    "GFS Pyrsos",
    "GFS Solomos",
]
453
# Fallback font list for `ara`.
ARABIC_FONTS = [
    "Arabic Transparent Bold",
    "Arabic Transparent",
    "Arab",
    "Arial Unicode MS Bold",
    "Arial Unicode MS",
    "ASVCodar LT Bold",
    "ASVCodar LT Light",
    "Badiya LT Bold",
    "Badiya LT",
    "Badr LT Bold",
    "Badr LT",
    "Dimnah",
    "Frutiger LT Arabic Bold",
    "Frutiger LT Arabic",
    "Furat",
    "Hassan LT Bold",
    "Hassan LT Light",
    "Jalal LT Bold",
    "Jalal LT Light",
    "Midan Bold",
    "Midan",
    "Mitra LT Bold",
    "Mitra LT Light",
    "Palatino LT Arabic",
    "Palatino Sans Arabic Bold",
    "Palatino Sans Arabic",
    "Simplified Arabic Bold",
    "Simplified Arabic",
    "Times New Roman, Bold",
    "Times New Roman,",
    "Traditional Arabic Bold",
    "Traditional Arabic",
]
488
# Fallback font list for `heb` and `yid`.
HEBREW_FONTS = [
    "Arial Bold",
    "Arial Bold Italic",
    "Arial Italic",
    "Arial",
    "Courier New Bold",
    "Courier New Bold Italic",
    "Courier New Italic",
    "Courier New",
    "Ergo Hebrew Semi-Bold",
    "Ergo Hebrew Semi-Bold Italic",
    "Ergo Hebrew",
    "Ergo Hebrew Italic",
    "Really No 2 LT W2G Light",
    "Really No 2 LT W2G Light Italic",
    "Really No 2 LT W2G Medium",
    "Really No 2 LT W2G Medium Italic",
    "Really No 2 LT W2G Semi-Bold",
    "Really No 2 LT W2G Semi-Bold Italic",
    "Really No 2 LT W2G Ultra-Bold",
    "Really No 2 LT W2G Ultra-Bold Italic",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Times New Roman,",
    "Lucida Sans",
    "Tahoma",
]
517
# Fallback font list for `asm` and `ben`.
# NOTE(review): "Lohit Bengali" is listed twice. The repeat is preserved here
# so behaviour is unchanged; confirm whether it is intentional.
BENGALI_FONTS = [
    "Bangla Medium",
    "Lohit Bengali",
    "Mukti Narrow",
    "Mukti Narrow Bold",
    "Jamrul Medium Semi-Expanded",
    "Likhan Medium",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "FreeSans",
    "FreeSans Oblique",
    "FreeSerif",
    "FreeSerif Italic",
    "Noto Sans Bengali Bold",
    "Noto Sans Bengali",
    "Ani",
    "Lohit Assamese",
    "Lohit Bengali",
    "Mitra Mono",
]
538
# Fallback font list for `kir`.
KYRGYZ_FONTS = [
    "Arial",
    "Arial Bold",
    "Arial Italic",
    "Arial Bold Italic",
    "Courier New",
    "Courier New Bold",
    "Courier New Italic",
    "Courier New Bold Italic",
    "Times New Roman,",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "DejaVu Serif",
    "DejaVu Serif Oblique",
    "DejaVu Serif Bold",
    "DejaVu Serif Bold Oblique",
    "Lucida Bright",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
]
560
# Fallback font list for `fas`, `pus`, `snd`, `uig` and `urd`.
PERSIAN_FONTS = [
    "Amiri Bold Italic",
    "Amiri Bold",
    "Amiri Italic",
    "Amiri",
    "Andale Sans Arabic Farsi",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Lateef",
    "Lucida Bright",
    "Lucida Sans Oblique",
    "Lucida Sans Semi-Bold",
    "Lucida Sans",
    "Lucida Sans Typewriter Bold",
    "Lucida Sans Typewriter Oblique",
    "Lucida Sans Typewriter",
    "Scheherazade",
    "Tahoma",
    "Times New Roman,",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Yakout Linotype Bold",
    "Yakout Linotype",
]
586
# Fallback font list for `amh` and `tir`.
AMHARIC_FONTS = [
    "Abyssinica SIL",
    "Droid Sans Ethiopic Bold",
    "Droid Sans Ethiopic",
    "FreeSerif",
    "Noto Sans Ethiopic Bold",
    "Noto Sans Ethiopic",
]
595
# Fallback font list for `hye`.
ARMENIAN_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "FreeMono",
    "FreeMono Italic",
    "FreeSans",
    "FreeSans Bold",
    "FreeSans Oblique",
]
606
# Fallback font list for `mya`.
BURMESE_FONTS = [
    "Myanmar Sans Pro",
    "Noto Sans Myanmar Bold",
    "Noto Sans Myanmar",
    "Padauk Bold",
    "Padauk",
    "TharLon",
]
615
# Fallback font list for `jav_java` (a single font).
JAVANESE_FONTS = ["Prada"]
617
# Fallback font list for `iku`; `chr` uses the same list plus
# "Noto Sans Cherokee".
NORTH_AMERICAN_ABORIGINAL_FONTS = [
    "Aboriginal Sans",
    "Aboriginal Sans Bold Italic",
    "Aboriginal Sans Italic",
    "Aboriginal Sans Bold",
    "Aboriginal Serif Bold",
    "Aboriginal Serif Bold Italic",
    "Aboriginal Serif Italic",
    "Aboriginal Serif",
]
628
# Fallback font list for `kat`.
# The BPG font names contain a literal backslash before "&". It is written
# as "\\&" because a bare "\&" is an invalid escape sequence in Python (it
# warns at compile time); the resulting string value is unchanged.
GEORGIAN_FONTS = [
    "Arial Unicode MS Bold",
    "Arial Unicode MS",
    "BPG Algeti GPL\\&GNU",
    "BPG Chveulebrivi GPL\\&GNU",
    "BPG Courier GPL\\&GNU",
    "BPG Courier S GPL\\&GNU",
    "BPG DejaVu Sans 2011 GNU-GPL",
    "BPG Elite GPL\\&GNU",
    "BPG Excelsior GPL\\&GNU",
    "BPG Glaho GPL\\&GNU",
    "BPG Gorda GPL\\&GNU",
    "BPG Ingiri GPL\\&GNU",
    "BPG Mrgvlovani Caps GNU\\&GPL",
    "BPG Mrgvlovani GPL\\&GNU",
    "BPG Nateli Caps GPL\\&GNU Light",
    "BPG Nateli Condenced GPL\\&GNU Light",
    "BPG Nateli GPL\\&GNU Light",
    "BPG Nino Medium Cond GPL\\&GNU",
    "BPG Nino Medium GPL\\&GNU Medium",
    "BPG Sans GPL\\&GNU",
    "BPG Sans Medium GPL\\&GNU",
    "BPG Sans Modern GPL\\&GNU",
    "BPG Sans Regular GPL\\&GNU",
    "BPG Serif GPL\\&GNU",
    "BPG Serif Modern GPL\\&GNU",
    "FreeMono",
    "FreeMono Bold Italic",
    "FreeSans",
    "FreeSerif",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
    "FreeSerif Italic",
]
663
# Fallback font list for `kat_old`.
# The BPG font names contain a literal backslash before "&". It is written
# as "\\&" because a bare "\&" is an invalid escape sequence in Python (it
# warns at compile time); the resulting string value is unchanged.
OLD_GEORGIAN_FONTS = [
    "Arial Unicode MS Bold",
    "Arial Unicode MS",
    "BPG Algeti GPL\\&GNU",
    "BPG Courier S GPL\\&GNU",
    "BPG DejaVu Sans 2011 GNU-GPL",
    "BPG Elite GPL\\&GNU",
    "BPG Excelsior GPL\\&GNU",
    "BPG Glaho GPL\\&GNU",
    "BPG Ingiri GPL\\&GNU",
    "BPG Mrgvlovani Caps GNU\\&GPL",
    "BPG Mrgvlovani GPL\\&GNU",
    "BPG Nateli Caps GPL\\&GNU Light",
    "BPG Nateli Condenced GPL\\&GNU Light",
    "BPG Nateli GPL\\&GNU Light",
    "BPG Nino Medium Cond GPL\\&GNU",
    "BPG Nino Medium GPL\\&GNU Medium",
    "BPG Sans GPL\\&GNU",
    "BPG Sans Medium GPL\\&GNU",
    "BPG Sans Modern GPL\\&GNU",
    "BPG Sans Regular GPL\\&GNU",
    "BPG Serif GPL\\&GNU",
    "BPG Serif Modern GPL\\&GNU",
    "FreeSans",
    "FreeSerif",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
    "FreeSerif Italic",
]
693
# Fallback font list for `khm`.
KHMER_FONTS = [
    "Khmer OS",
    "Khmer OS System",
    "Khmer OS Battambang",
    "Khmer OS Bokor",
    "Khmer OS Content",
    "Khmer OS Fasthand",
    "Khmer OS Freehand",
    "Khmer OS Metal Chrieng",
    "Khmer OS Muol Light",
    "Khmer OS Muol Pali",
    "Khmer OS Muol",
    "Khmer OS Siemreap",
    "Noto Sans Bold",
    "Noto Sans",
    "Noto Serif Khmer Bold",
    "Noto Serif Khmer Light",
]
712
# Kurdish font names (presumably the fallback for `kur_ara`; the code that
# consumes this list is not visible in this section — confirm).
KURDISH_FONTS = [
    "Amiri Bold Italic",
    "Amiri Bold",
    "Amiri Italic",
    "Amiri",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Lateef",
    "Lucida Bright",
    "Lucida Sans Oblique",
    "Lucida Sans Semi-Bold",
    "Lucida Sans",
    "Lucida Sans Typewriter Bold",
    "Lucida Sans Typewriter Oblique",
    "Lucida Sans Typewriter",
    "Scheherazade",
    "Tahoma",
    "Times New Roman,",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Unikurd Web",
    "Yakout Linotype Bold",
    "Yakout Linotype",
]
738
# Fallback font list for `lao`.
LAOTHIAN_FONTS = [
    "Phetsarath OT",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Dhyana Bold",
    "Dhyana",
    "Lao Muang Don",
    "Lao Muang Khong",
    "Lao Sans Pro",
    "Noto Sans Lao Bold",
    "Noto Sans Lao",
    "Noto Sans Lao UI Bold",
    "Noto Sans Lao UI",
    "Noto Serif Lao Bold",
    "Noto Serif Lao",
    "Phetsarath Bold",
    "Phetsarath",
    "Souliyo Unicode",
]
759
# Fallback font list for `guj`.
GUJARATI_FONTS = [
    "Lohit Gujarati",
    "Rekha Medium",
    "Samyak Gujarati Medium",
    "aakar Medium",
    "padmaa Bold",
    "padmaa Medium",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "FreeSans",
    "Noto Sans Gujarati Bold",
    "Noto Sans Gujarati",
    "Shruti",
    "Shruti Bold",
]
776
# Fallback font list for `mal`.
MALAYALAM_FONTS = [
    "AnjaliOldLipi",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Dyuthi",
    "FreeSerif",
    "Kalyani",
    "Kartika",
    "Kartika Bold",
    "Lohit Malayalam",
    "Meera",
    "Noto Sans Malayalam Bold",
    "Noto Sans Malayalam",
    "Rachana",
    "Rachana_w01",
    "RaghuMalayalam",
    "suruma",
]
796
# Fallback font list for `ori`.
ORIYA_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "ori1Uni Medium",
    "Samyak Oriya Medium",
    "Lohit Oriya",
]
805
# Fallback font list for `pan`.
PUNJABI_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Saab",
    "Lohit Punjabi",
    "Noto Sans Gurmukhi",
    "Noto Sans Gurmukhi Bold",
    "FreeSans",
    "FreeSans Bold",
    "FreeSerif",
]
818
# Fallback font list for `sin`.
SINHALA_FONTS = [
    "Noto Sans Sinhala Bold",
    "Noto Sans Sinhala",
    "OCRUnicode",
    "Yagpo",
    "LKLUG",
    "FreeSerif",
]
827
# Fallback font list for `syr`.
SYRIAC_FONTS = [
    "East Syriac Adiabene",
    "East Syriac Ctesiphon",
    "Estrangelo Antioch",
    "Estrangelo Edessa",
    "Estrangelo Midyat",
    "Estrangelo Nisibin",
    "Estrangelo Quenneshrin",
    "Estrangelo Talada",
    "Estrangelo TurAbdin",
    "Serto Batnan Bold",
    "Serto Batnan",
    "Serto Jerusalem Bold",
    "Serto Jerusalem Italic",
    "Serto Jerusalem",
    "Serto Kharput",
    "Serto Malankara",
    "Serto Mardin Bold",
    "Serto Mardin",
    "Serto Urhoy Bold",
    "Serto Urhoy",
    "FreeSans",
]
851
# Fallback font list for `div` (a single font).
THAANA_FONTS = ["FreeSerif"]
853
# Fallback font list for `bod` and `dzo`.
TIBETAN_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "DDC Uchen",
    "Jomolhari",
    "Kailasa",
    "Kokonor",
    "Tibetan Machine Uni",
    "TibetanTsugRing",
    "Yagpo",
]
866
# The following fonts will be rendered vertically in phase I.
VERTICAL_FONTS = [
    "TakaoExGothic",
    "TakaoExMincho",
    "AR PL UKai Patched",
    "AR PL UMing Patched Light",
    "Baekmuk Batang Patched",
]
875
# Directory prefix under which the "<lang>.corpus.txt" files are looked up.
# Read once at import time from the FLAGS_webtext_prefix environment variable;
# defaults to the empty string when the variable is unset.
FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")
877
878
# Set language-specific values for several global variables, including
#   ${TEXT_CORPUS}
#      holds the text corpus file for the language, used in phase F
#   ${FONTS[@]}
#      holds a sequence of applicable fonts for the language, used in
#      phase F & I. Only set if not already set, i.e. from the command line
#   ${TRAINING_DATA_ARGUMENTS}
#      non-default arguments to the training_data program used in phase T
#   ${FILTER_ARGUMENTS}
#      character-code-specific filtering to distinguish between scripts
#      (e.g. CJK) used by filter_forbidden_characters in phase F
#   ${WORDLIST2DAWG_ARGUMENTS}
#      specify fixed length dawg generation for non-space-delimited lang
# TODO(dsl): We can refactor these into functions that assign FONTS,
# TEXT_CORPUS, etc. separately.
895 # The default text location is now given directly from the language code.
896 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/{lang}.corpus.txt"
897 FILTER_ARGUMENTS = []
898 WORDLIST2DAWG_ARGUMENTS = ""
899 # These dawg factors represent the fraction of the corpus not covered by the
900 # dawg, and seem like reasonable defaults, but the optimal value is likely
901 # to be highly corpus-dependent, as well as somewhat language-dependent.
902 # Number dawg factor is the fraction of all numeric strings that are not
903 # covered, which is why it is higher relative to the others.
904 PUNC_DAWG_FACTOR = None
905 NUMBER_DAWG_FACTOR = 0.125
906 WORD_DAWG_FACTOR = 0.05
907 BIGRAM_DAWG_FACTOR = 0.015
908 TRAINING_DATA_ARGUMENTS = []
909 FRAGMENTS_DISABLED = "y"
910 RUN_SHAPE_CLUSTERING = False
911 AMBIGS_FILTER_DENOMINATOR = "100000"
912 LEADING = 32
913 MEAN_COUNT = 40 # Default for latin script.
914 # Language to mix with the language for maximum accuracy. Defaults to eng.
915 # If no language is good, set to the base language.
916 MIX_LANG = "eng"
917 FONTS = ctx.fonts
918 TEXT2IMAGE_EXTRA_ARGS = []
919 EXPOSURES = []
920
921 GENERATE_WORD_BIGRAMS = None
922 WORD_DAWG_SIZE = None
923
924 # Latin languages.
925 if lang == "enm":
926 TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported
927 if not FONTS:
928 FONTS = EARLY_LATIN_FONTS
929 elif lang == "frm":
930 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/fra.corpus.txt"
931 # Make long-s substitutions for Middle French text
932 FILTER_ARGUMENTS += ["--make_early_language_variant=fra"]
933 TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported.
934 if not FONTS:
935 FONTS = EARLY_LATIN_FONTS
936 elif lang == "frk":
937 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/deu.corpus.txt"
938 if not FONTS:
939 FONTS = FRAKTUR_FONTS
940 elif lang == "ita_old":
941 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/ita.corpus.txt"
942 # Make long-s substitutions for Early Italian text
943 FILTER_ARGUMENTS += ["--make_early_language_variant=ita"]
944 TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported.
945 if not FONTS:
946 FONTS = EARLY_LATIN_FONTS
947 elif lang == "lat":
948 if not EXPOSURES:
949 EXPOSURES = "-3 -2 -1 0 1 2 3".split()
950 if not FONTS:
951 FONTS = NEOLATIN_FONTS
952 elif lang == "spa_old":
953 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/spa.corpus.txt"
954 # Make long-s substitutions for Early Spanish text
955 FILTER_ARGUMENTS += ["--make_early_language_variant=spa"]
956 TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported.
957 if not FONTS:
958 FONTS = EARLY_LATIN_FONTS
959 elif lang == "srp_latn":
960 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/srp.corpus.txt"
961 elif lang == "vie":
962 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
963 if not FONTS:
964 FONTS = VIETNAMESE_FONTS
965 # Highly inflective languages get a bigger dawg size.
966 # TODO(rays) Add more here!
967 elif lang == "hun":
968 WORD_DAWG_SIZE = 1_000_000
969 elif lang == "pol":
970 WORD_DAWG_SIZE = 1_000_000
971
972 # Latin with default treatment.
973 elif lang == "afr":
974 pass
975 elif lang == "aze":
976 pass
977 elif lang == "bos":
978 pass
979 elif lang == "cat":
980 pass
981 elif lang == "ceb":
982 pass
983 elif lang == "ces":
984 PUNC_DAWG_FACTOR = 0.004
985 elif lang == "cym":
986 pass
987 elif lang == "dan":
988 pass
989 elif lang == "deu":
990 WORD_DAWG_FACTOR = 0.125
991 elif lang == "eng":
992 WORD_DAWG_FACTOR = 0.03
993 elif lang == "epo":
994 pass
995 elif lang == "est":
996 pass
997 elif lang == "eus":
998 pass
999 elif lang == "fil":
1000 pass
1001 elif lang == "fin":
1002 pass
1003 elif lang == "fra":
1004 WORD_DAWG_FACTOR = 0.08
1005 elif lang == "gle":
1006 pass
1007 elif lang == "gle_uncial":
1008 if not FONTS:
1009 FONTS = IRISH_UNCIAL_FONTS
1010 elif lang == "glg":
1011 pass
1012 elif lang == "hat":
1013 pass
1014 elif lang == "hrv":
1015 pass
1016 elif lang == "iast":
1017 pass
1018 elif lang == "ind":
1019 pass
1020 elif lang == "isl":
1021 pass
1022 elif lang == "ita":
1023 pass
1024 elif lang == "jav":
1025 pass
1026 elif lang == "lav":
1027 pass
1028 elif lang == "lit":
1029 pass
1030 elif lang == "mlt":
1031 pass
1032 elif lang == "msa":
1033 pass
1034 elif lang == "nld":
1035 WORD_DAWG_FACTOR = 0.02
1036 elif lang == "nor":
1037 pass
1038 elif lang == "por":
1039 pass
1040 elif lang == "ron":
1041 pass
1042 elif lang == "slk":
1043 pass
1044 elif lang == "slv":
1045 pass
1046 elif lang == "spa":
1047 pass
1048 elif lang == "sqi":
1049 pass
1050 elif lang == "swa":
1051 pass
1052 elif lang == "swe":
1053 pass
1054 elif lang == "tgl":
1055 pass
1056 elif lang == "tur":
1057 pass
1058 elif lang == "uzb":
1059 pass
1060 elif lang == "zlm":
1061 pass
1062
1063 # Special code for performing language-id that is trained on
1064 # EFIGS+Latin+Vietnamese text with regular + fraktur fonts.
1065 elif lang == "lat_lid":
1066 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/lat_lid.corpus.txt"
1067 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1068 GENERATE_WORD_BIGRAMS = 0
1069 # Strip unrenderable words as not all fonts will render the extended
1070 # latin symbols found in Vietnamese text.
1071 WORD_DAWG_SIZE = 1_000_000
1072 if not FONTS:
1073 FONTS = EARLY_LATIN_FONTS
1074
1075 # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic.
1076 elif lang == "rus":
1077 if not FONTS:
1078 FONTS = RUSSIAN_FONTS
1079 MIX_LANG = "rus"
1080 NUMBER_DAWG_FACTOR = 0.05
1081 WORD_DAWG_SIZE = 1_000_000
1082 elif lang in (
1083 "aze_cyrl",
1084 "bel",
1085 "bul",
1086 "kaz",
1087 "mkd",
1088 "srp",
1089 "tgk",
1090 "ukr",
1091 "uzb_cyrl",
1092 ):
1093 MIX_LANG = f"{lang}"
1094 if not FONTS:
1095 FONTS = RUSSIAN_FONTS
1096
1097 # Special code for performing Cyrillic language-id that is trained on
1098 # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian
1099 # text with the list of Russian fonts.
1100 elif lang == "cyr_lid":
1101 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/cyr_lid.corpus.txt"
1102 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1103 GENERATE_WORD_BIGRAMS = 0
1104 WORD_DAWG_SIZE = 1_000_000
1105 if not FONTS:
1106 FONTS = RUSSIAN_FONTS
1107
1108 # South Asian scripts mostly have a lot of different graphemes, so trim
1109 # down the MEAN_COUNT so as not to get a huge amount of text.
1110 elif lang in ("asm", "ben"):
1111 MEAN_COUNT = 15
1112 WORD_DAWG_FACTOR = 0.15
1113 if not FONTS:
1114 FONTS = BENGALI_FONTS
1115 elif lang in ("bih", "hin", "mar", "nep", "san"):
1116 MEAN_COUNT = 15
1117 WORD_DAWG_FACTOR = 0.15
1118 if not FONTS:
1119 FONTS = DEVANAGARI_FONTS
1120 elif lang == "bod":
1121 MEAN_COUNT = 15
1122 WORD_DAWG_FACTOR = 0.15
1123 if not FONTS:
1124 FONTS = TIBETAN_FONTS
1125 elif lang == "dzo":
1126 WORD_DAWG_FACTOR = 0.01
1127 if not FONTS:
1128 FONTS = TIBETAN_FONTS
1129 elif lang == "guj":
1130 MEAN_COUNT = 15
1131 WORD_DAWG_FACTOR = 0.15
1132 if not FONTS:
1133 FONTS = GUJARATI_FONTS
1134 elif lang == "kan":
1135 MEAN_COUNT = 15
1136 WORD_DAWG_FACTOR = 0.15
1137 TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
1138 TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
1139 if not FONTS:
1140 FONTS = KANNADA_FONTS
1141 elif lang == "mal":
1142 MEAN_COUNT = 15
1143 WORD_DAWG_FACTOR = 0.15
1144 TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
1145 TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
1146 if not FONTS:
1147 FONTS = MALAYALAM_FONTS
1148 elif lang == "ori":
1149 WORD_DAWG_FACTOR = 0.01
1150 if not FONTS:
1151 FONTS = ORIYA_FONTS
1152 elif lang == "pan":
1153 MEAN_COUNT = 15
1154 WORD_DAWG_FACTOR = 0.01
1155 if not FONTS:
1156 FONTS = PUNJABI_FONTS
1157 elif lang == "sin":
1158 MEAN_COUNT = 15
1159 WORD_DAWG_FACTOR = 0.01
1160 if not FONTS:
1161 FONTS = SINHALA_FONTS
1162 elif lang == "tam":
1163 MEAN_COUNT = 30
1164 WORD_DAWG_FACTOR = 0.15
1165 TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
1166 TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
1167 if not FONTS:
1168 FONTS = TAMIL_FONTS
1169 elif lang == "tel":
1170 MEAN_COUNT = 15
1171 WORD_DAWG_FACTOR = 0.15
1172 TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
1173 TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
1174 if not FONTS:
1175 FONTS = TELUGU_FONTS
1176
1177 # SouthEast Asian scripts.
1178 elif lang == "jav_java":
1179 MEAN_COUNT = 15
1180 WORD_DAWG_FACTOR = 0.15
1181 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1182 if not FONTS:
1183 FONTS = JAVANESE_FONTS
1184 elif lang == "khm":
1185 MEAN_COUNT = 15
1186 WORD_DAWG_FACTOR = 0.15
1187 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1188 if not FONTS:
1189 FONTS = KHMER_FONTS
1190 elif lang == "lao":
1191 MEAN_COUNT = 15
1192 WORD_DAWG_FACTOR = 0.15
1193 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1194 if not FONTS:
1195 FONTS = LAOTHIAN_FONTS
1196 elif lang == "mya":
1197 MEAN_COUNT = 12
1198 WORD_DAWG_FACTOR = 0.15
1199 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1200 if not FONTS:
1201 FONTS = BURMESE_FONTS
1202 elif lang == "tha":
1203 MEAN_COUNT = 30
1204 WORD_DAWG_FACTOR = 0.01
1205 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1206 FILTER_ARGUMENTS += ["--segmenter_lang=tha"]
1207 TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
1208 AMBIGS_FILTER_DENOMINATOR = "1000"
1209 LEADING = 48
1210 if not FONTS:
1211 FONTS = THAI_FONTS
1212
1213 # CJK
1214 elif lang == "chi_sim":
1215 MEAN_COUNT = 15
1216 PUNC_DAWG_FACTOR = 0.015
1217 WORD_DAWG_FACTOR = 0.015
1218 GENERATE_WORD_BIGRAMS = 0
1219 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1220 TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
1221 FILTER_ARGUMENTS += ["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"]
1222 if not FONTS:
1223 FONTS = CHI_SIM_FONTS
1224 elif lang == "chi_tra":
1225 MEAN_COUNT = 15
1226 WORD_DAWG_FACTOR = 0.015
1227 GENERATE_WORD_BIGRAMS = 0
1228 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1229 TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
1230 FILTER_ARGUMENTS += ["--charset_filter=chi_tr", "--segmenter_lang=chi_tra"]
1231 if not FONTS:
1232 FONTS = CHI_TRA_FONTS
1233 elif lang == "jpn":
1234 MEAN_COUNT = 15
1235 WORD_DAWG_FACTOR = 0.015
1236 GENERATE_WORD_BIGRAMS = 0
1237 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1238 TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
1239 FILTER_ARGUMENTS += ["--charset_filter=jpn", "--segmenter_lang=jpn"]
1240 if not FONTS:
1241 FONTS = JPN_FONTS
1242 elif lang == "kor":
1243 MEAN_COUNT = 20
1244 WORD_DAWG_FACTOR = 0.015
1245 NUMBER_DAWG_FACTOR = 0.05
1246 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1247 TRAINING_DATA_ARGUMENTS += ["--desired_bigrams="]
1248 GENERATE_WORD_BIGRAMS = 0
1249 FILTER_ARGUMENTS += ["--charset_filter=kor", "--segmenter_lang=kor"]
1250 if not FONTS:
1251 FONTS = KOREAN_FONTS
1252
1253 # Middle-Eastern scripts.
1254 elif lang == "ara":
1255 if not FONTS:
1256 FONTS = ARABIC_FONTS
1257 elif lang == "div":
1258 if not FONTS:
1259 FONTS = THAANA_FONTS
1260 elif lang in ("fas", "pus", "snd", "uig", "urd"):
1261 if not FONTS:
1262 FONTS = PERSIAN_FONTS
1263 elif lang in ("heb", "yid"):
1264 NUMBER_DAWG_FACTOR = 0.05
1265 WORD_DAWG_FACTOR = 0.08
1266 if not FONTS:
1267 FONTS = HEBREW_FONTS
1268 elif lang == "syr":
1269 if not FONTS:
1270 FONTS = SYRIAC_FONTS
1271
1272 # Other scripts.
1273 elif lang in ("amh", "tir"):
1274 if not FONTS:
1275 FONTS = AMHARIC_FONTS
1276 elif lang == "chr":
1277 if not FONTS:
1278 FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS, "Noto Sans Cherokee"]
1279 elif lang == "ell":
1280 NUMBER_DAWG_FACTOR = 0.05
1281 WORD_DAWG_FACTOR = 0.08
1282 if not FONTS:
1283 FONTS = GREEK_FONTS
1284 elif lang == "grc":
1285 if not EXPOSURES:
1286 EXPOSURES = "-3 -2 -1 0 1 2 3".split()
1287 if not FONTS:
1288 FONTS = ANCIENT_GREEK_FONTS
1289 elif lang == "hye":
1290 if not FONTS:
1291 FONTS = ARMENIAN_FONTS
1292 elif lang == "iku":
1293 if not FONTS:
1294 FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
1295 elif lang == "kat":
1296 if not FONTS:
1297 FONTS = GEORGIAN_FONTS
1298 elif lang == "kat_old":
1299 TEXT_CORPUS = f"{FLAGS_webtext_prefix}/kat.corpus.txt"
1300 if not FONTS:
1301 FONTS = OLD_GEORGIAN_FONTS
1302 elif lang == "kir":
1303 if not FONTS:
1304 FONTS = KYRGYZ_FONTS
1305 TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=100"]
1306 elif lang == "kmr":
1307 if not FONTS:
1308 FONTS = LATIN_FONTS
1309 elif lang == "kur_ara":
1310 if not FONTS:
1311 FONTS = KURDISH_FONTS
1312 else:
1313 raise ValueError(f"Error: {lang} is not a valid language code")
1314
1315 FLAGS_mean_count = int(os.environ.get("FLAGS_mean_count", -1))
1316 if FLAGS_mean_count > 0:
1317 TRAINING_DATA_ARGUMENTS += [f"--mean_count={FLAGS_mean_count}"]
1318 elif not MEAN_COUNT:
1319 TRAINING_DATA_ARGUMENTS += [f"--mean_count={MEAN_COUNT}"]
1320
1321 # Default to Latin fonts if none have been set
1322 if not FONTS:
1323 FONTS = LATIN_FONTS
1324
1325 # Default to 0 exposure if it hasn't been set
1326 if not EXPOSURES:
1327 EXPOSURES = [0]
1328 # Set right-to-left and normalization mode.
1329 if lang in (
1330 "ara",
1331 "div",
1332 "fas",
1333 "pus",
1334 "snd",
1335 "syr",
1336 "uig",
1337 "urd",
1338 "kur_ara",
1339 "heb",
1340 "yid",
1341 ):
1342 LANG_IS_RTL = True
1343 NORM_MODE = 2
1344 elif lang in (
1345 "asm",
1346 "ben",
1347 "bih",
1348 "hin",
1349 "mar",
1350 "nep",
1351 "guj",
1352 "kan",
1353 "mal",
1354 "tam",
1355 "tel",
1356 "pan",
1357 "dzo",
1358 "sin",
1359 "san",
1360 "bod",
1361 "ori",
1362 "khm",
1363 "mya",
1364 "tha",
1365 "lao",
1366 "jav ",
1367 "jav_java",
1368 ):
1369 LANG_IS_RTL = False
1370 NORM_MODE = 2
1371 else:
1372 LANG_IS_RTL = False
1373 NORM_MODE = 1
1374
1375 vars_to_transfer = {
1376 'ambigs_filter_denominator': AMBIGS_FILTER_DENOMINATOR,
1377 'bigram_dawg_factor': BIGRAM_DAWG_FACTOR,
1378 'exposures': EXPOSURES,
1379 'filter_arguments': FILTER_ARGUMENTS,
1380 'fonts': FONTS,
1381 'fragments_disabled': FRAGMENTS_DISABLED,
1382 'generate_word_bigrams': GENERATE_WORD_BIGRAMS,
1383 'lang_is_rtl': LANG_IS_RTL,
1384 'leading': LEADING,
1385 'mean_count': MEAN_COUNT,
1386 'mix_lang': MIX_LANG,
1387 'norm_mode': NORM_MODE,
1388 'number_dawg_factor': NUMBER_DAWG_FACTOR,
1389 'punc_dawg_factor': PUNC_DAWG_FACTOR,
1390 'run_shape_clustering': RUN_SHAPE_CLUSTERING,
1391 'text2image_extra_args': TEXT2IMAGE_EXTRA_ARGS,
1392 'text_corpus': TEXT_CORPUS,
1393 'training_data_arguments': TRAINING_DATA_ARGUMENTS,
1394 'word_dawg_factor': WORD_DAWG_FACTOR,
1395 'word_dawg_size': WORD_DAWG_SIZE,
1396 'wordlist2dawg_arguments': WORDLIST2DAWG_ARGUMENTS,
1397 }
1398
1399 for attr, value in vars_to_transfer.items():
1400 if hasattr(ctx, attr):
1401 if getattr(ctx, attr) != value:
1402 log.debug(f"{attr} = {value} (was {getattr(ctx, attr)})")
1403 setattr(ctx, attr, value)
1404 else:
1405 log.debug(f"{attr} = {value} (set on cmdline)")
1406 else:
1407 log.debug(f"{attr} = {value}")
1408 setattr(ctx, attr, value)
1409
1410 return ctx
1411
1412# =============================================================================
1413# END of Language specific info
1414# =============================================================================
def set_lang_specific_parameters(ctx, lang)