706a051a42
(and pylint warning fixes to the gen_translit_* scripts by Pravin Satpute)
5406 lines
200 KiB
Diff
5406 lines
200 KiB
Diff
From ee7a6e89183bdc8453dd9a0dadf133e65deb9e0b Mon Sep 17 00:00:00 2001
|
||
From: Mike FABIAN <mfabian@redhat.com>
|
||
Date: Fri, 17 Apr 2015 09:12:05 +0200
|
||
Subject: [PATCH 3/5] Update the translit files to Unicode 7.0.0
|
||
|
||
for localedata/ChangeLog
|
||
|
||
[BZ #16061]
|
||
* unicode_utils.py: New.
|
||
* existing scripts changed to use unicode_utils.py
|
||
* gen_translit_circle.py: New
|
||
* gen_translit_cjk_compat.py: New
|
||
* gen_translit_combining.py: New
|
||
* gen_translit_compat.py: New
|
||
* gen_translit_font.py: New
|
||
* gen_translit_fraction.py: New
|
||
* locales/translit_circle: Update.
|
||
* locales/translit_cjk_compat: Update.
|
||
* locales/translit_combining: Update.
|
||
* locales/translit_compat: Update.
|
||
* locales/translit_font: Update.
|
||
* locales/translit_fraction: Update.
|
||
---
|
||
localedata/locales/translit_circle | 30 +-
|
||
localedata/locales/translit_cjk_compat | 422 +++++++++++++-
|
||
localedata/locales/translit_combining | 636 +++++++++++++++++++++-
|
||
localedata/locales/translit_compat | 578 +++++++++++++++++++-
|
||
localedata/locales/translit_font | 151 ++++-
|
||
localedata/locales/translit_fraction | 15 +-
|
||
localedata/unicode-gen/Makefile | 42 +-
|
||
localedata/unicode-gen/gen_translit_circle.py | 150 +++++
|
||
localedata/unicode-gen/gen_translit_cjk_compat.py | 220 ++++++++
|
||
localedata/unicode-gen/gen_translit_combining.py | 442 +++++++++++++++
|
||
localedata/unicode-gen/gen_translit_compat.py | 326 +++++++++++
|
||
localedata/unicode-gen/gen_translit_font.py | 156 ++++++
|
||
localedata/unicode-gen/gen_translit_fraction.py | 197 +++++++
|
||
localedata/unicode-gen/gen_unicode_ctype.py | 497 +----------------
|
||
localedata/unicode-gen/unicode_utils.py | 502 +++++++++++++++++
|
||
localedata/unicode-gen/utf8_compatibility.py | 217 ++------
|
||
localedata/unicode-gen/utf8_gen.py | 28 +-
|
||
17 files changed, 3896 insertions(+), 713 deletions(-)
|
||
create mode 100755 localedata/unicode-gen/gen_translit_circle.py
|
||
create mode 100755 localedata/unicode-gen/gen_translit_cjk_compat.py
|
||
create mode 100755 localedata/unicode-gen/gen_translit_combining.py
|
||
create mode 100755 localedata/unicode-gen/gen_translit_compat.py
|
||
create mode 100755 localedata/unicode-gen/gen_translit_font.py
|
||
create mode 100755 localedata/unicode-gen/gen_translit_fraction.py
|
||
create mode 100644 localedata/unicode-gen/unicode_utils.py
|
||
|
||
diff --git a/localedata/locales/translit_circle b/localedata/locales/translit_circle
|
||
index f701bc9..5d5f58c 100644
|
||
--- a/localedata/locales/translit_circle
|
||
+++ b/localedata/locales/translit_circle
|
||
@@ -2,9 +2,7 @@ escape_char /
|
||
comment_char %
|
||
|
||
% Transliterations of encircled characters.
|
||
-% Generated through
|
||
-% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<circle>[^;]*;' UnicodeData.txt | \
|
||
-% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<circle> \([^;]*\);.*$/<U\1> "<U0028 \3 0029>"% \2/' -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G'
|
||
+% Generated automatically from UnicodeData.txt by gen_translit_circle.py on 2015-06-10 for Unicode 7.0.0.
|
||
|
||
LC_CTYPE
|
||
|
||
@@ -156,6 +154,14 @@ translit_start
|
||
<U24E9> "<U0028><U007A><U0029>"
|
||
% CIRCLED DIGIT ZERO
|
||
<U24EA> "<U0028><U0030><U0029>"
|
||
+% CIRCLED IDEOGRAPH QUESTION
|
||
+<U3244> "<U0028><U554F><U0029>"
|
||
+% CIRCLED IDEOGRAPH KINDERGARTEN
|
||
+<U3245> "<U0028><U5E7C><U0029>"
|
||
+% CIRCLED IDEOGRAPH SCHOOL
|
||
+<U3246> "<U0028><U6587><U0029>"
|
||
+% CIRCLED IDEOGRAPH KOTO
|
||
+<U3247> "<U0028><U7B8F><U0029>"
|
||
% CIRCLED NUMBER TWENTY ONE
|
||
<U3251> "<U0028><U0032><U0031><U0029>"
|
||
% CIRCLED NUMBER TWENTY TWO
|
||
@@ -242,6 +248,12 @@ translit_start
|
||
<U327A> "<U0028><U1111><U1161><U0029>"
|
||
% CIRCLED HANGUL HIEUH A
|
||
<U327B> "<U0028><U1112><U1161><U0029>"
|
||
+% CIRCLED KOREAN CHARACTER CHAMKO
|
||
+<U327C> "<U0028><U110E><U1161><U11B7><U1100><U1169><U0029>"
|
||
+% CIRCLED KOREAN CHARACTER JUEUI
|
||
+<U327D> "<U0028><U110C><U116E><U110B><U1174><U0029>"
|
||
+% CIRCLED HANGUL IEUNG U
|
||
+<U327E> "<U0028><U110B><U116E><U0029>"
|
||
% CIRCLED IDEOGRAPH ONE
|
||
<U3280> "<U0028><U4E00><U0029>"
|
||
% CIRCLED IDEOGRAPH TWO
|
||
@@ -464,6 +476,18 @@ translit_start
|
||
<U32FD> "<U0028><U30F1><U0029>"
|
||
% CIRCLED KATAKANA WO
|
||
<U32FE> "<U0028><U30F2><U0029>"
|
||
+% CIRCLED ITALIC LATIN CAPITAL LETTER C
|
||
+<U0001F12B> "<U0028><U0043><U0029>"
|
||
+% CIRCLED ITALIC LATIN CAPITAL LETTER R
|
||
+<U0001F12C> "<U0028><U0052><U0029>"
|
||
+% CIRCLED CD
|
||
+<U0001F12D> "<U0028><U0043><U0044><U0029>"
|
||
+% CIRCLED WZ
|
||
+<U0001F12E> "<U0028><U0057><U005A><U0029>"
|
||
+% CIRCLED IDEOGRAPH ADVANTAGE
|
||
+<U0001F250> "<U0028><U5F97><U0029>"
|
||
+% CIRCLED IDEOGRAPH ACCEPT
|
||
+<U0001F251> "<U0028><U53EF><U0029>"
|
||
|
||
translit_end
|
||
|
||
diff --git a/localedata/locales/translit_cjk_compat b/localedata/locales/translit_cjk_compat
|
||
index c73e5e3..a20c6ca 100644
|
||
--- a/localedata/locales/translit_cjk_compat
|
||
+++ b/localedata/locales/translit_cjk_compat
|
||
@@ -2,18 +2,22 @@ escape_char /
|
||
comment_char %
|
||
|
||
% Transliterations of CJK compatibility characters.
|
||
-% Generated through
|
||
-% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<square>[^;]*;' UnicodeData.txt | \
|
||
-% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<square> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G'
|
||
-% and
|
||
-% $ grep '[^;]*;CJK COMPATIBILITY IDEOGRAPH[^;]*;[^;]*;[^;]*;[^;]*;[^;]' UnicodeData.txt | \
|
||
-% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;\([^;]*\);.*$/<U\1> <U\3>% \2/' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G' | \
|
||
-% sed -e 's/<U\(.....\)>/<U000\1>/g'
|
||
+% Generated automatically from UnicodeData.txt by gen_translit_cjk_compat.py on 2015-06-10 for Unicode 7.0.0.
|
||
|
||
LC_CTYPE
|
||
|
||
translit_start
|
||
|
||
+% PARTNERSHIP SIGN
|
||
+<U3250> "<U0050><U0054><U0045>"
|
||
+% SQUARE HG
|
||
+<U32CC> "<U0048><U0067>"
|
||
+% SQUARE ERG
|
||
+<U32CD> "<U0065><U0072><U0067>"
|
||
+% SQUARE EV
|
||
+<U32CE> "<U0065><U0056>"
|
||
+% LIMITED LIABILITY SIGN
|
||
+<U32CF> "<U004C><U0054><U0044>"
|
||
% SQUARE APAATO
|
||
<U3300> "<U30A2><U30D1><U30FC><U30C8>"
|
||
% SQUARE ARUHUA
|
||
@@ -202,6 +206,14 @@ translit_start
|
||
<U3375> "<U006F><U0056>"
|
||
% SQUARE PC
|
||
<U3376> "<U0070><U0063>"
|
||
+% SQUARE DM
|
||
+<U3377> "<U0064><U006D>"
|
||
+% SQUARE DM SQUARED
|
||
+<U3378> "<U0064><U006D><U00B2>";"<U0064><U006D><U005E><U0032>"
|
||
+% SQUARE DM CUBED
|
||
+<U3379> "<U0064><U006D><U00B3>";"<U0064><U006D><U005E><U0033>"
|
||
+% SQUARE IU
|
||
+<U337A> "<U0049><U0055>"
|
||
% SQUARE ERA NAME HEISEI
|
||
<U337B> "<U5E73><U6210>"
|
||
% SQUARE ERA NAME SYOUWA
|
||
@@ -400,6 +412,170 @@ translit_start
|
||
<U33DC> "<U0053><U0076>"
|
||
% SQUARE WB
|
||
<U33DD> "<U0057><U0062>"
|
||
+% SQUARE V OVER M
|
||
+<U33DE> "<U0056><U2215><U006D>";"<U0056><U002F><U006D>"
|
||
+% SQUARE A OVER M
|
||
+<U33DF> "<U0041><U2215><U006D>";"<U0041><U002F><U006D>"
|
||
+% SQUARE GAL
|
||
+<U33FF> "<U0067><U0061><U006C>"
|
||
+% SQUARED LATIN CAPITAL LETTER A
|
||
+<U0001F130> <U0041>
|
||
+% SQUARED LATIN CAPITAL LETTER B
|
||
+<U0001F131> <U0042>
|
||
+% SQUARED LATIN CAPITAL LETTER C
|
||
+<U0001F132> <U0043>
|
||
+% SQUARED LATIN CAPITAL LETTER D
|
||
+<U0001F133> <U0044>
|
||
+% SQUARED LATIN CAPITAL LETTER E
|
||
+<U0001F134> <U0045>
|
||
+% SQUARED LATIN CAPITAL LETTER F
|
||
+<U0001F135> <U0046>
|
||
+% SQUARED LATIN CAPITAL LETTER G
|
||
+<U0001F136> <U0047>
|
||
+% SQUARED LATIN CAPITAL LETTER H
|
||
+<U0001F137> <U0048>
|
||
+% SQUARED LATIN CAPITAL LETTER I
|
||
+<U0001F138> <U0049>
|
||
+% SQUARED LATIN CAPITAL LETTER J
|
||
+<U0001F139> <U004A>
|
||
+% SQUARED LATIN CAPITAL LETTER K
|
||
+<U0001F13A> <U004B>
|
||
+% SQUARED LATIN CAPITAL LETTER L
|
||
+<U0001F13B> <U004C>
|
||
+% SQUARED LATIN CAPITAL LETTER M
|
||
+<U0001F13C> <U004D>
|
||
+% SQUARED LATIN CAPITAL LETTER N
|
||
+<U0001F13D> <U004E>
|
||
+% SQUARED LATIN CAPITAL LETTER O
|
||
+<U0001F13E> <U004F>
|
||
+% SQUARED LATIN CAPITAL LETTER P
|
||
+<U0001F13F> <U0050>
|
||
+% SQUARED LATIN CAPITAL LETTER Q
|
||
+<U0001F140> <U0051>
|
||
+% SQUARED LATIN CAPITAL LETTER R
|
||
+<U0001F141> <U0052>
|
||
+% SQUARED LATIN CAPITAL LETTER S
|
||
+<U0001F142> <U0053>
|
||
+% SQUARED LATIN CAPITAL LETTER T
|
||
+<U0001F143> <U0054>
|
||
+% SQUARED LATIN CAPITAL LETTER U
|
||
+<U0001F144> <U0055>
|
||
+% SQUARED LATIN CAPITAL LETTER V
|
||
+<U0001F145> <U0056>
|
||
+% SQUARED LATIN CAPITAL LETTER W
|
||
+<U0001F146> <U0057>
|
||
+% SQUARED LATIN CAPITAL LETTER X
|
||
+<U0001F147> <U0058>
|
||
+% SQUARED LATIN CAPITAL LETTER Y
|
||
+<U0001F148> <U0059>
|
||
+% SQUARED LATIN CAPITAL LETTER Z
|
||
+<U0001F149> <U005A>
|
||
+% SQUARED HV
|
||
+<U0001F14A> "<U0048><U0056>"
|
||
+% SQUARED MV
|
||
+<U0001F14B> "<U004D><U0056>"
|
||
+% SQUARED SD
|
||
+<U0001F14C> "<U0053><U0044>"
|
||
+% SQUARED SS
|
||
+<U0001F14D> "<U0053><U0053>"
|
||
+% SQUARED PPV
|
||
+<U0001F14E> "<U0050><U0050><U0056>"
|
||
+% SQUARED WC
|
||
+<U0001F14F> "<U0057><U0043>"
|
||
+% SQUARE DJ
|
||
+<U0001F190> "<U0044><U004A>"
|
||
+% SQUARE HIRAGANA HOKA
|
||
+<U0001F200> "<U307B><U304B>"
|
||
+% SQUARED KATAKANA KOKO
|
||
+<U0001F201> "<U30B3><U30B3>"
|
||
+% SQUARED KATAKANA SA
|
||
+<U0001F202> <U30B5>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-624B
|
||
+<U0001F210> <U624B>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-5B57
|
||
+<U0001F211> <U5B57>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-53CC
|
||
+<U0001F212> <U53CC>
|
||
+% SQUARED KATAKANA DE
|
||
+<U0001F213> <U30C7>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-4E8C
|
||
+<U0001F214> <U4E8C>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-591A
|
||
+<U0001F215> <U591A>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-89E3
|
||
+<U0001F216> <U89E3>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-5929
|
||
+<U0001F217> <U5929>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-4EA4
|
||
+<U0001F218> <U4EA4>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6620
|
||
+<U0001F219> <U6620>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-7121
|
||
+<U0001F21A> <U7121>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6599
|
||
+<U0001F21B> <U6599>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-524D
|
||
+<U0001F21C> <U524D>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-5F8C
|
||
+<U0001F21D> <U5F8C>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-518D
|
||
+<U0001F21E> <U518D>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-65B0
|
||
+<U0001F21F> <U65B0>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-521D
|
||
+<U0001F220> <U521D>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-7D42
|
||
+<U0001F221> <U7D42>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-751F
|
||
+<U0001F222> <U751F>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-8CA9
|
||
+<U0001F223> <U8CA9>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-58F0
|
||
+<U0001F224> <U58F0>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-5439
|
||
+<U0001F225> <U5439>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6F14
|
||
+<U0001F226> <U6F14>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6295
|
||
+<U0001F227> <U6295>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6355
|
||
+<U0001F228> <U6355>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-4E00
|
||
+<U0001F229> <U4E00>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-4E09
|
||
+<U0001F22A> <U4E09>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-904A
|
||
+<U0001F22B> <U904A>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-5DE6
|
||
+<U0001F22C> <U5DE6>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-4E2D
|
||
+<U0001F22D> <U4E2D>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-53F3
|
||
+<U0001F22E> <U53F3>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6307
|
||
+<U0001F22F> <U6307>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-8D70
|
||
+<U0001F230> <U8D70>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6253
|
||
+<U0001F231> <U6253>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-7981
|
||
+<U0001F232> <U7981>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-7A7A
|
||
+<U0001F233> <U7A7A>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-5408
|
||
+<U0001F234> <U5408>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6E80
|
||
+<U0001F235> <U6E80>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6709
|
||
+<U0001F236> <U6709>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-6708
|
||
+<U0001F237> <U6708>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-7533
|
||
+<U0001F238> <U7533>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-5272
|
||
+<U0001F239> <U5272>
|
||
+% SQUARED CJK UNIFIED IDEOGRAPH-55B6
|
||
+<U0001F23A> <U55B6>
|
||
% CJK COMPATIBILITY IDEOGRAPH-F900
|
||
<UF900> <U8C48>
|
||
% CJK COMPATIBILITY IDEOGRAPH-F901
|
||
@@ -980,6 +1156,10 @@ translit_start
|
||
<UFA2C> <U9928>
|
||
% CJK COMPATIBILITY IDEOGRAPH-FA2D
|
||
<UFA2D> <U9DB4>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA2E
|
||
+<UFA2E> <U90DE>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA2F
|
||
+<UFA2F> <U96B7>
|
||
% CJK COMPATIBILITY IDEOGRAPH-FA30
|
||
<UFA30> <U4FAE>
|
||
% CJK COMPATIBILITY IDEOGRAPH-FA31
|
||
@@ -1098,6 +1278,224 @@ translit_start
|
||
<UFA69> <U97FF>
|
||
% CJK COMPATIBILITY IDEOGRAPH-FA6A
|
||
<UFA6A> <U983B>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA6B
|
||
+<UFA6B> <U6075>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA6C
|
||
+<UFA6C> <U000242EE>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA6D
|
||
+<UFA6D> <U8218>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA70
|
||
+<UFA70> <U4E26>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA71
|
||
+<UFA71> <U51B5>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA72
|
||
+<UFA72> <U5168>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA73
|
||
+<UFA73> <U4F80>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA74
|
||
+<UFA74> <U5145>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA75
|
||
+<UFA75> <U5180>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA76
|
||
+<UFA76> <U52C7>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA77
|
||
+<UFA77> <U52FA>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA78
|
||
+<UFA78> <U559D>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA79
|
||
+<UFA79> <U5555>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA7A
|
||
+<UFA7A> <U5599>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA7B
|
||
+<UFA7B> <U55E2>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA7C
|
||
+<UFA7C> <U585A>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA7D
|
||
+<UFA7D> <U58B3>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA7E
|
||
+<UFA7E> <U5944>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA7F
|
||
+<UFA7F> <U5954>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA80
|
||
+<UFA80> <U5A62>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA81
|
||
+<UFA81> <U5B28>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA82
|
||
+<UFA82> <U5ED2>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA83
|
||
+<UFA83> <U5ED9>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA84
|
||
+<UFA84> <U5F69>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA85
|
||
+<UFA85> <U5FAD>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA86
|
||
+<UFA86> <U60D8>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA87
|
||
+<UFA87> <U614E>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA88
|
||
+<UFA88> <U6108>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA89
|
||
+<UFA89> <U618E>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA8A
|
||
+<UFA8A> <U6160>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA8B
|
||
+<UFA8B> <U61F2>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA8C
|
||
+<UFA8C> <U6234>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA8D
|
||
+<UFA8D> <U63C4>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA8E
|
||
+<UFA8E> <U641C>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA8F
|
||
+<UFA8F> <U6452>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA90
|
||
+<UFA90> <U6556>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA91
|
||
+<UFA91> <U6674>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA92
|
||
+<UFA92> <U6717>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA93
|
||
+<UFA93> <U671B>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA94
|
||
+<UFA94> <U6756>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA95
|
||
+<UFA95> <U6B79>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA96
|
||
+<UFA96> <U6BBA>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA97
|
||
+<UFA97> <U6D41>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA98
|
||
+<UFA98> <U6EDB>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA99
|
||
+<UFA99> <U6ECB>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA9A
|
||
+<UFA9A> <U6F22>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA9B
|
||
+<UFA9B> <U701E>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA9C
|
||
+<UFA9C> <U716E>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA9D
|
||
+<UFA9D> <U77A7>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA9E
|
||
+<UFA9E> <U7235>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FA9F
|
||
+<UFA9F> <U72AF>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA0
|
||
+<UFAA0> <U732A>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA1
|
||
+<UFAA1> <U7471>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA2
|
||
+<UFAA2> <U7506>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA3
|
||
+<UFAA3> <U753B>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA4
|
||
+<UFAA4> <U761D>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA5
|
||
+<UFAA5> <U761F>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA6
|
||
+<UFAA6> <U76CA>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA7
|
||
+<UFAA7> <U76DB>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA8
|
||
+<UFAA8> <U76F4>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAA9
|
||
+<UFAA9> <U774A>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAAA
|
||
+<UFAAA> <U7740>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAAB
|
||
+<UFAAB> <U78CC>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAAC
|
||
+<UFAAC> <U7AB1>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAAD
|
||
+<UFAAD> <U7BC0>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAAE
|
||
+<UFAAE> <U7C7B>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAAF
|
||
+<UFAAF> <U7D5B>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB0
|
||
+<UFAB0> <U7DF4>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB1
|
||
+<UFAB1> <U7F3E>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB2
|
||
+<UFAB2> <U8005>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB3
|
||
+<UFAB3> <U8352>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB4
|
||
+<UFAB4> <U83EF>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB5
|
||
+<UFAB5> <U8779>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB6
|
||
+<UFAB6> <U8941>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB7
|
||
+<UFAB7> <U8986>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB8
|
||
+<UFAB8> <U8996>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAB9
|
||
+<UFAB9> <U8ABF>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FABA
|
||
+<UFABA> <U8AF8>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FABB
|
||
+<UFABB> <U8ACB>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FABC
|
||
+<UFABC> <U8B01>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FABD
|
||
+<UFABD> <U8AFE>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FABE
|
||
+<UFABE> <U8AED>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FABF
|
||
+<UFABF> <U8B39>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC0
|
||
+<UFAC0> <U8B8A>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC1
|
||
+<UFAC1> <U8D08>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC2
|
||
+<UFAC2> <U8F38>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC3
|
||
+<UFAC3> <U9072>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC4
|
||
+<UFAC4> <U9199>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC5
|
||
+<UFAC5> <U9276>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC6
|
||
+<UFAC6> <U967C>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC7
|
||
+<UFAC7> <U96E3>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC8
|
||
+<UFAC8> <U9756>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAC9
|
||
+<UFAC9> <U97DB>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FACA
|
||
+<UFACA> <U97FF>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FACB
|
||
+<UFACB> <U980B>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FACC
|
||
+<UFACC> <U983B>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FACD
|
||
+<UFACD> <U9B12>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FACE
|
||
+<UFACE> <U9F9C>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FACF
|
||
+<UFACF> <U0002284A>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD0
|
||
+<UFAD0> <U00022844>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD1
|
||
+<UFAD1> <U000233D5>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD2
|
||
+<UFAD2> <U3B9D>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD3
|
||
+<UFAD3> <U4018>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD4
|
||
+<UFAD4> <U4039>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD5
|
||
+<UFAD5> <U00025249>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD6
|
||
+<UFAD6> <U00025CD0>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD7
|
||
+<UFAD7> <U00027ED3>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD8
|
||
+<UFAD8> <U9F43>
|
||
+% CJK COMPATIBILITY IDEOGRAPH-FAD9
|
||
+<UFAD9> <U9F8E>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F800
|
||
<U0002F800> <U4E3D>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F801
|
||
@@ -1307,7 +1705,7 @@ translit_start
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F867
|
||
<U0002F867> <U36EE>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F868
|
||
-<U0002F868> <U0002136A>
|
||
+<U0002F868> <U36FC>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F869
|
||
<U0002F869> <U5B08>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F86A
|
||
@@ -1331,7 +1729,7 @@ translit_start
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F873
|
||
<U0002F873> <U5C06>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F874
|
||
-<U0002F874> <U5F33>
|
||
+<U0002F874> <U5F53>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F875
|
||
<U0002F875> <U5C22>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F876
|
||
@@ -1673,7 +2071,7 @@ translit_start
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F91E
|
||
<U0002F91E> <U719C>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F91F
|
||
-<U0002F91F> <U43AB>
|
||
+<U0002F91F> <U000243AB>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F920
|
||
<U0002F920> <U7228>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F921
|
||
@@ -1801,7 +2199,7 @@ translit_start
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F95E
|
||
<U0002F95E> <U00025AA7>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F95F
|
||
-<U0002F95F> <U7AAE>
|
||
+<U0002F95F> <U7AEE>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F960
|
||
<U0002F960> <U4202>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F961
|
||
@@ -1993,7 +2391,7 @@ translit_start
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F9BE
|
||
<U0002F9BE> <U8786>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F9BF
|
||
-<U0002F9BF> <U4D57>
|
||
+<U0002F9BF> <U45D7>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F9C0
|
||
<U0002F9C0> <U87E1>
|
||
% CJK COMPATIBILITY IDEOGRAPH-2F9C1
|
||
diff --git a/localedata/locales/translit_combining b/localedata/locales/translit_combining
|
||
index 44c62f9..b1b5345 100644
|
||
--- a/localedata/locales/translit_combining
|
||
+++ b/localedata/locales/translit_combining
|
||
@@ -3,7 +3,7 @@ comment_char %
|
||
|
||
% Transliterations that remove all combining characters (accents,
|
||
% pronounciation marks, etc.).
|
||
-% Generated from UnicodeData.txt.
|
||
+% Generated automatically from UnicodeData.txt by gen_translit_combining.py on 2015-06-10 for Unicode 7.0.0.
|
||
|
||
LC_CTYPE
|
||
|
||
@@ -167,6 +167,40 @@ translit_start
|
||
<U034D> ""
|
||
% COMBINING UPWARDS ARROW BELOW
|
||
<U034E> ""
|
||
+% COMBINING GRAPHEME JOINER
|
||
+<U034F> ""
|
||
+% COMBINING RIGHT ARROWHEAD ABOVE
|
||
+<U0350> ""
|
||
+% COMBINING LEFT HALF RING ABOVE
|
||
+<U0351> ""
|
||
+% COMBINING FERMATA
|
||
+<U0352> ""
|
||
+% COMBINING X BELOW
|
||
+<U0353> ""
|
||
+% COMBINING LEFT ARROWHEAD BELOW
|
||
+<U0354> ""
|
||
+% COMBINING RIGHT ARROWHEAD BELOW
|
||
+<U0355> ""
|
||
+% COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW
|
||
+<U0356> ""
|
||
+% COMBINING RIGHT HALF RING ABOVE
|
||
+<U0357> ""
|
||
+% COMBINING DOT ABOVE RIGHT
|
||
+<U0358> ""
|
||
+% COMBINING ASTERISK BELOW
|
||
+<U0359> ""
|
||
+% COMBINING DOUBLE RING BELOW
|
||
+<U035A> ""
|
||
+% COMBINING ZIGZAG ABOVE
|
||
+<U035B> ""
|
||
+% COMBINING DOUBLE BREVE BELOW
|
||
+<U035C> ""
|
||
+% COMBINING DOUBLE BREVE
|
||
+<U035D> ""
|
||
+% COMBINING DOUBLE MACRON
|
||
+<U035E> ""
|
||
+% COMBINING DOUBLE MACRON BELOW
|
||
+<U035F> ""
|
||
% COMBINING DOUBLE TILDE
|
||
<U0360> ""
|
||
% COMBINING DOUBLE INVERTED BREVE
|
||
@@ -199,6 +233,68 @@ translit_start
|
||
<U036E> ""
|
||
% COMBINING LATIN SMALL LETTER X
|
||
<U036F> ""
|
||
+% HEBREW ACCENT ETNAHTA
|
||
+<U0591> ""
|
||
+% HEBREW ACCENT SEGOL
|
||
+<U0592> ""
|
||
+% HEBREW ACCENT SHALSHELET
|
||
+<U0593> ""
|
||
+% HEBREW ACCENT ZAQEF QATAN
|
||
+<U0594> ""
|
||
+% HEBREW ACCENT ZAQEF GADOL
|
||
+<U0595> ""
|
||
+% HEBREW ACCENT TIPEHA
|
||
+<U0596> ""
|
||
+% HEBREW ACCENT REVIA
|
||
+<U0597> ""
|
||
+% HEBREW ACCENT ZARQA
|
||
+<U0598> ""
|
||
+% HEBREW ACCENT PASHTA
|
||
+<U0599> ""
|
||
+% HEBREW ACCENT YETIV
|
||
+<U059A> ""
|
||
+% HEBREW ACCENT TEVIR
|
||
+<U059B> ""
|
||
+% HEBREW ACCENT GERESH
|
||
+<U059C> ""
|
||
+% HEBREW ACCENT GERESH MUQDAM
|
||
+<U059D> ""
|
||
+% HEBREW ACCENT GERSHAYIM
|
||
+<U059E> ""
|
||
+% HEBREW ACCENT QARNEY PARA
|
||
+<U059F> ""
|
||
+% HEBREW ACCENT TELISHA GEDOLA
|
||
+<U05A0> ""
|
||
+% HEBREW ACCENT PAZER
|
||
+<U05A1> ""
|
||
+% HEBREW ACCENT ATNAH HAFUKH
|
||
+<U05A2> ""
|
||
+% HEBREW ACCENT MUNAH
|
||
+<U05A3> ""
|
||
+% HEBREW ACCENT MAHAPAKH
|
||
+<U05A4> ""
|
||
+% HEBREW ACCENT MERKHA
|
||
+<U05A5> ""
|
||
+% HEBREW ACCENT MERKHA KEFULA
|
||
+<U05A6> ""
|
||
+% HEBREW ACCENT DARGA
|
||
+<U05A7> ""
|
||
+% HEBREW ACCENT QADMA
|
||
+<U05A8> ""
|
||
+% HEBREW ACCENT TELISHA QETANA
|
||
+<U05A9> ""
|
||
+% HEBREW ACCENT YERAH BEN YOMO
|
||
+<U05AA> ""
|
||
+% HEBREW ACCENT OLE
|
||
+<U05AB> ""
|
||
+% HEBREW ACCENT ILUY
|
||
+<U05AC> ""
|
||
+% HEBREW ACCENT DEHI
|
||
+<U05AD> ""
|
||
+% HEBREW ACCENT ZINOR
|
||
+<U05AE> ""
|
||
+% HEBREW MARK MASORA CIRCLE
|
||
+<U05AF> ""
|
||
% HEBREW POINT SHEVA
|
||
<U05B0> ""
|
||
% HEBREW POINT HATAF SEGOL
|
||
@@ -219,6 +315,8 @@ translit_start
|
||
<U05B8> ""
|
||
% HEBREW POINT HOLAM
|
||
<U05B9> ""
|
||
+% HEBREW POINT HOLAM HASER FOR VAV
|
||
+<U05BA> ""
|
||
% HEBREW POINT QUBUTS
|
||
<U05BB> ""
|
||
% HEBREW POINT DAGESH OR MAPIQ
|
||
@@ -231,12 +329,358 @@ translit_start
|
||
<U05C1> ""
|
||
% HEBREW POINT SIN DOT
|
||
<U05C2> ""
|
||
+% HEBREW MARK UPPER DOT
|
||
+<U05C4> ""
|
||
+% HEBREW MARK LOWER DOT
|
||
+<U05C5> ""
|
||
+% HEBREW POINT QAMATS QATAN
|
||
+<U05C7> ""
|
||
+% ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM
|
||
+<U0610> ""
|
||
+% ARABIC SIGN ALAYHE ASSALLAM
|
||
+<U0611> ""
|
||
+% ARABIC SIGN RAHMATULLAH ALAYHE
|
||
+<U0612> ""
|
||
+% ARABIC SIGN RADI ALLAHOU ANHU
|
||
+<U0613> ""
|
||
+% ARABIC SIGN TAKHALLUS
|
||
+<U0614> ""
|
||
+% ARABIC SMALL HIGH TAH
|
||
+<U0615> ""
|
||
+% ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
|
||
+<U0616> ""
|
||
+% ARABIC SMALL HIGH ZAIN
|
||
+<U0617> ""
|
||
+% ARABIC SMALL FATHA
|
||
+<U0618> ""
|
||
+% ARABIC SMALL DAMMA
|
||
+<U0619> ""
|
||
+% ARABIC SMALL KASRA
|
||
+<U061A> ""
|
||
+% ARABIC FATHATAN
|
||
+<U064B> ""
|
||
+% ARABIC DAMMATAN
|
||
+<U064C> ""
|
||
+% ARABIC KASRATAN
|
||
+<U064D> ""
|
||
+% ARABIC FATHA
|
||
+<U064E> ""
|
||
+% ARABIC DAMMA
|
||
+<U064F> ""
|
||
+% ARABIC KASRA
|
||
+<U0650> ""
|
||
+% ARABIC SHADDA
|
||
+<U0651> ""
|
||
+% ARABIC SUKUN
|
||
+<U0652> ""
|
||
% ARABIC MADDAH ABOVE
|
||
<U0653> ""
|
||
% ARABIC HAMZA ABOVE
|
||
<U0654> ""
|
||
% ARABIC HAMZA BELOW
|
||
<U0655> ""
|
||
+% ARABIC SUBSCRIPT ALEF
|
||
+<U0656> ""
|
||
+% ARABIC INVERTED DAMMA
|
||
+<U0657> ""
|
||
+% ARABIC MARK NOON GHUNNA
|
||
+<U0658> ""
|
||
+% ARABIC ZWARAKAY
|
||
+<U0659> ""
|
||
+% ARABIC VOWEL SIGN SMALL V ABOVE
|
||
+<U065A> ""
|
||
+% ARABIC VOWEL SIGN INVERTED SMALL V ABOVE
|
||
+<U065B> ""
|
||
+% ARABIC VOWEL SIGN DOT BELOW
|
||
+<U065C> ""
|
||
+% ARABIC REVERSED DAMMA
|
||
+<U065D> ""
|
||
+% ARABIC FATHA WITH TWO DOTS
|
||
+<U065E> ""
|
||
+% ARABIC WAVY HAMZA BELOW
|
||
+<U065F> ""
|
||
+% ARABIC LETTER SUPERSCRIPT ALEF
|
||
+<U0670> ""
|
||
+% ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
|
||
+<U06D6> ""
|
||
+% ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
|
||
+<U06D7> ""
|
||
+% ARABIC SMALL HIGH MEEM INITIAL FORM
|
||
+<U06D8> ""
|
||
+% ARABIC SMALL HIGH LAM ALEF
|
||
+<U06D9> ""
|
||
+% ARABIC SMALL HIGH JEEM
|
||
+<U06DA> ""
|
||
+% ARABIC SMALL HIGH THREE DOTS
|
||
+<U06DB> ""
|
||
+% ARABIC SMALL HIGH SEEN
|
||
+<U06DC> ""
|
||
+% ARABIC SMALL HIGH ROUNDED ZERO
|
||
+<U06DF> ""
|
||
+% ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO
|
||
+<U06E0> ""
|
||
+% ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
|
||
+<U06E1> ""
|
||
+% ARABIC SMALL HIGH MEEM ISOLATED FORM
|
||
+<U06E2> ""
|
||
+% ARABIC SMALL LOW SEEN
|
||
+<U06E3> ""
|
||
+% ARABIC SMALL HIGH MADDA
|
||
+<U06E4> ""
|
||
+% ARABIC SMALL HIGH YEH
|
||
+<U06E7> ""
|
||
+% ARABIC SMALL HIGH NOON
|
||
+<U06E8> ""
|
||
+% ARABIC EMPTY CENTRE LOW STOP
|
||
+<U06EA> ""
|
||
+% ARABIC EMPTY CENTRE HIGH STOP
|
||
+<U06EB> ""
|
||
+% ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE
|
||
+<U06EC> ""
|
||
+% ARABIC SMALL LOW MEEM
|
||
+<U06ED> ""
|
||
+% ARABIC CURLY FATHA
|
||
+<U08E4> ""
|
||
+% ARABIC CURLY DAMMA
|
||
+<U08E5> ""
|
||
+% ARABIC CURLY KASRA
|
||
+<U08E6> ""
|
||
+% ARABIC CURLY FATHATAN
|
||
+<U08E7> ""
|
||
+% ARABIC CURLY DAMMATAN
|
||
+<U08E8> ""
|
||
+% ARABIC CURLY KASRATAN
|
||
+<U08E9> ""
|
||
+% ARABIC TONE ONE DOT ABOVE
|
||
+<U08EA> ""
|
||
+% ARABIC TONE TWO DOTS ABOVE
|
||
+<U08EB> ""
|
||
+% ARABIC TONE LOOP ABOVE
|
||
+<U08EC> ""
|
||
+% ARABIC TONE ONE DOT BELOW
|
||
+<U08ED> ""
|
||
+% ARABIC TONE TWO DOTS BELOW
|
||
+<U08EE> ""
|
||
+% ARABIC TONE LOOP BELOW
|
||
+<U08EF> ""
|
||
+% ARABIC OPEN FATHATAN
|
||
+<U08F0> ""
|
||
+% ARABIC OPEN DAMMATAN
|
||
+<U08F1> ""
|
||
+% ARABIC OPEN KASRATAN
|
||
+<U08F2> ""
|
||
+% ARABIC SMALL HIGH WAW
|
||
+<U08F3> ""
|
||
+% ARABIC FATHA WITH RING
|
||
+<U08F4> ""
|
||
+% ARABIC FATHA WITH DOT ABOVE
|
||
+<U08F5> ""
|
||
+% ARABIC KASRA WITH DOT BELOW
|
||
+<U08F6> ""
|
||
+% ARABIC LEFT ARROWHEAD ABOVE
|
||
+<U08F7> ""
|
||
+% ARABIC RIGHT ARROWHEAD ABOVE
|
||
+<U08F8> ""
|
||
+% ARABIC LEFT ARROWHEAD BELOW
|
||
+<U08F9> ""
|
||
+% ARABIC RIGHT ARROWHEAD BELOW
|
||
+<U08FA> ""
|
||
+% ARABIC DOUBLE RIGHT ARROWHEAD ABOVE
|
||
+<U08FB> ""
|
||
+% ARABIC DOUBLE RIGHT ARROWHEAD ABOVE WITH DOT
|
||
+<U08FC> ""
|
||
+% ARABIC RIGHT ARROWHEAD ABOVE WITH DOT
|
||
+<U08FD> ""
|
||
+% ARABIC DAMMA WITH DOT
|
||
+<U08FE> ""
|
||
+% ARABIC MARK SIDEWAYS NOON GHUNNA
|
||
+<U08FF> ""
|
||
+% COMBINING DOUBLED CIRCUMFLEX ACCENT
|
||
+<U1AB0> ""
|
||
+% COMBINING DIAERESIS-RING
|
||
+<U1AB1> ""
|
||
+% COMBINING INFINITY
|
||
+<U1AB2> ""
|
||
+% COMBINING DOWNWARDS ARROW
|
||
+<U1AB3> ""
|
||
+% COMBINING TRIPLE DOT
|
||
+<U1AB4> ""
|
||
+% COMBINING X-X BELOW
|
||
+<U1AB5> ""
|
||
+% COMBINING WIGGLY LINE BELOW
|
||
+<U1AB6> ""
|
||
+% COMBINING OPEN MARK BELOW
|
||
+<U1AB7> ""
|
||
+% COMBINING DOUBLE OPEN MARK BELOW
|
||
+<U1AB8> ""
|
||
+% COMBINING LIGHT CENTRALIZATION STROKE BELOW
|
||
+<U1AB9> ""
|
||
+% COMBINING STRONG CENTRALIZATION STROKE BELOW
|
||
+<U1ABA> ""
|
||
+% COMBINING PARENTHESES ABOVE
|
||
+<U1ABB> ""
|
||
+% COMBINING DOUBLE PARENTHESES ABOVE
|
||
+<U1ABC> ""
|
||
+% COMBINING PARENTHESES BELOW
|
||
+<U1ABD> ""
|
||
+% COMBINING PARENTHESES OVERLAY
|
||
+<U1ABE> ""
|
||
+% COMBINING DOTTED GRAVE ACCENT
|
||
+<U1DC0> ""
|
||
+% COMBINING DOTTED ACUTE ACCENT
|
||
+<U1DC1> ""
|
||
+% COMBINING SNAKE BELOW
|
||
+<U1DC2> ""
|
||
+% COMBINING SUSPENSION MARK
|
||
+<U1DC3> ""
|
||
+% COMBINING MACRON-ACUTE
|
||
+<U1DC4> ""
|
||
+% COMBINING GRAVE-MACRON
|
||
+<U1DC5> ""
|
||
+% COMBINING MACRON-GRAVE
|
||
+<U1DC6> ""
|
||
+% COMBINING ACUTE-MACRON
|
||
+<U1DC7> ""
|
||
+% COMBINING GRAVE-ACUTE-GRAVE
|
||
+<U1DC8> ""
|
||
+% COMBINING ACUTE-GRAVE-ACUTE
|
||
+<U1DC9> ""
|
||
+% COMBINING LATIN SMALL LETTER R BELOW
|
||
+<U1DCA> ""
|
||
+% COMBINING BREVE-MACRON
|
||
+<U1DCB> ""
|
||
+% COMBINING MACRON-BREVE
|
||
+<U1DCC> ""
|
||
+% COMBINING DOUBLE CIRCUMFLEX ABOVE
|
||
+<U1DCD> ""
|
||
+% COMBINING OGONEK ABOVE
|
||
+<U1DCE> ""
|
||
+% COMBINING ZIGZAG BELOW
|
||
+<U1DCF> ""
|
||
+% COMBINING IS BELOW
|
||
+<U1DD0> ""
|
||
+% COMBINING UR ABOVE
|
||
+<U1DD1> ""
|
||
+% COMBINING US ABOVE
|
||
+<U1DD2> ""
|
||
+% COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE
|
||
+<U1DD3> ""
|
||
+% COMBINING LATIN SMALL LETTER AE
|
||
+<U1DD4> ""
|
||
+% COMBINING LATIN SMALL LETTER AO
|
||
+<U1DD5> ""
|
||
+% COMBINING LATIN SMALL LETTER AV
|
||
+<U1DD6> ""
|
||
+% COMBINING LATIN SMALL LETTER C CEDILLA
|
||
+<U1DD7> ""
|
||
+% COMBINING LATIN SMALL LETTER INSULAR D
|
||
+<U1DD8> ""
|
||
+% COMBINING LATIN SMALL LETTER ETH
|
||
+<U1DD9> ""
|
||
+% COMBINING LATIN SMALL LETTER G
|
||
+<U1DDA> ""
|
||
+% COMBINING LATIN LETTER SMALL CAPITAL G
|
||
+<U1DDB> ""
|
||
+% COMBINING LATIN SMALL LETTER K
|
||
+<U1DDC> ""
|
||
+% COMBINING LATIN SMALL LETTER L
|
||
+<U1DDD> ""
|
||
+% COMBINING LATIN LETTER SMALL CAPITAL L
|
||
+<U1DDE> ""
|
||
+% COMBINING LATIN LETTER SMALL CAPITAL M
|
||
+<U1DDF> ""
|
||
+% COMBINING LATIN SMALL LETTER N
|
||
+<U1DE0> ""
|
||
+% COMBINING LATIN LETTER SMALL CAPITAL N
|
||
+<U1DE1> ""
|
||
+% COMBINING LATIN LETTER SMALL CAPITAL R
|
||
+<U1DE2> ""
|
||
+% COMBINING LATIN SMALL LETTER R ROTUNDA
|
||
+<U1DE3> ""
|
||
+% COMBINING LATIN SMALL LETTER S
|
||
+<U1DE4> ""
|
||
+% COMBINING LATIN SMALL LETTER LONG S
|
||
+<U1DE5> ""
|
||
+% COMBINING LATIN SMALL LETTER Z
|
||
+<U1DE6> ""
|
||
+% COMBINING LATIN SMALL LETTER ALPHA
|
||
+<U1DE7> ""
|
||
+% COMBINING LATIN SMALL LETTER B
|
||
+<U1DE8> ""
|
||
+% COMBINING LATIN SMALL LETTER BETA
|
||
+<U1DE9> ""
|
||
+% COMBINING LATIN SMALL LETTER SCHWA
|
||
+<U1DEA> ""
|
||
+% COMBINING LATIN SMALL LETTER F
|
||
+<U1DEB> ""
|
||
+% COMBINING LATIN SMALL LETTER L WITH DOUBLE MIDDLE TILDE
|
||
+<U1DEC> ""
|
||
+% COMBINING LATIN SMALL LETTER O WITH LIGHT CENTRALIZATION STROKE
|
||
+<U1DED> ""
|
||
+% COMBINING LATIN SMALL LETTER P
|
||
+<U1DEE> ""
|
||
+% COMBINING LATIN SMALL LETTER ESH
|
||
+<U1DEF> ""
|
||
+% COMBINING LATIN SMALL LETTER U WITH LIGHT CENTRALIZATION STROKE
|
||
+<U1DF0> ""
|
||
+% COMBINING LATIN SMALL LETTER W
|
||
+<U1DF1> ""
|
||
+% COMBINING LATIN SMALL LETTER A WITH DIAERESIS
|
||
+<U1DF2> ""
|
||
+% COMBINING LATIN SMALL LETTER O WITH DIAERESIS
|
||
+<U1DF3> ""
|
||
+% COMBINING LATIN SMALL LETTER U WITH DIAERESIS
|
||
+<U1DF4> ""
|
||
+% COMBINING UP TACK ABOVE
|
||
+<U1DF5> ""
|
||
+% COMBINING DOUBLE INVERTED BREVE BELOW
|
||
+<U1DFC> ""
|
||
+% COMBINING ALMOST EQUAL TO BELOW
|
||
+<U1DFD> ""
|
||
+% COMBINING LEFT ARROWHEAD ABOVE
|
||
+<U1DFE> ""
|
||
+% COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
|
||
+<U1DFF> ""
|
||
+% COMBINING LEFT HARPOON ABOVE
|
||
+<U20D0> ""
|
||
+% COMBINING RIGHT HARPOON ABOVE
|
||
+<U20D1> ""
|
||
+% COMBINING LONG VERTICAL LINE OVERLAY
|
||
+<U20D2> ""
|
||
+% COMBINING SHORT VERTICAL LINE OVERLAY
|
||
+<U20D3> ""
|
||
+% COMBINING ANTICLOCKWISE ARROW ABOVE
|
||
+<U20D4> ""
|
||
+% COMBINING CLOCKWISE ARROW ABOVE
|
||
+<U20D5> ""
|
||
+% COMBINING LEFT ARROW ABOVE
|
||
+<U20D6> ""
|
||
+% COMBINING RIGHT ARROW ABOVE
|
||
+<U20D7> ""
|
||
+% COMBINING RING OVERLAY
|
||
+<U20D8> ""
|
||
+% COMBINING CLOCKWISE RING OVERLAY
|
||
+<U20D9> ""
|
||
+% COMBINING ANTICLOCKWISE RING OVERLAY
|
||
+<U20DA> ""
|
||
+% COMBINING THREE DOTS ABOVE
|
||
+<U20DB> ""
|
||
+% COMBINING FOUR DOTS ABOVE
|
||
+<U20DC> ""
|
||
+% COMBINING ENCLOSING CIRCLE
|
||
+<U20DD> ""
|
||
+% COMBINING ENCLOSING SQUARE
|
||
+<U20DE> ""
|
||
+% COMBINING ENCLOSING DIAMOND
|
||
+<U20DF> ""
|
||
+% COMBINING ENCLOSING CIRCLE BACKSLASH
|
||
+<U20E0> ""
|
||
+% COMBINING LEFT RIGHT ARROW ABOVE
|
||
+<U20E1> ""
|
||
+% COMBINING ENCLOSING SCREEN
|
||
+<U20E2> ""
|
||
+% COMBINING ENCLOSING KEYCAP
|
||
+<U20E3> ""
|
||
% COMBINING ENCLOSING UPWARD POINTING TRIANGLE
|
||
<U20E4> ""
|
||
% COMBINING REVERSE SOLIDUS OVERLAY
|
||
@@ -251,10 +695,70 @@ translit_start
|
||
<U20E9> ""
|
||
% COMBINING LEFTWARDS ARROW OVERLAY
|
||
<U20EA> ""
|
||
+% COMBINING LONG DOUBLE SOLIDUS OVERLAY
|
||
+<U20EB> ""
|
||
+% COMBINING RIGHTWARDS HARPOON WITH BARB DOWNWARDS
|
||
+<U20EC> ""
|
||
+% COMBINING LEFTWARDS HARPOON WITH BARB DOWNWARDS
|
||
+<U20ED> ""
|
||
+% COMBINING LEFT ARROW BELOW
|
||
+<U20EE> ""
|
||
+% COMBINING RIGHT ARROW BELOW
|
||
+<U20EF> ""
|
||
+% COMBINING ASTERISK ABOVE
|
||
+<U20F0> ""
|
||
% COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
|
||
<U3099> ""
|
||
% COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||
<U309A> ""
|
||
+% HEBREW POINT JUDEO-SPANISH VARIKA
|
||
+<UFB1E> ""
|
||
+% COMBINING LIGATURE LEFT HALF
|
||
+<UFE20> ""
|
||
+% COMBINING LIGATURE RIGHT HALF
|
||
+<UFE21> ""
|
||
+% COMBINING DOUBLE TILDE LEFT HALF
|
||
+<UFE22> ""
|
||
+% COMBINING DOUBLE TILDE RIGHT HALF
|
||
+<UFE23> ""
|
||
+% COMBINING MACRON LEFT HALF
|
||
+<UFE24> ""
|
||
+% COMBINING MACRON RIGHT HALF
|
||
+<UFE25> ""
|
||
+% COMBINING CONJOINING MACRON
|
||
+<UFE26> ""
|
||
+% COMBINING LIGATURE LEFT HALF BELOW
|
||
+<UFE27> ""
|
||
+% COMBINING LIGATURE RIGHT HALF BELOW
|
||
+<UFE28> ""
|
||
+% COMBINING TILDE LEFT HALF BELOW
|
||
+<UFE29> ""
|
||
+% COMBINING TILDE RIGHT HALF BELOW
|
||
+<UFE2A> ""
|
||
+% COMBINING MACRON LEFT HALF BELOW
|
||
+<UFE2B> ""
|
||
+% COMBINING MACRON RIGHT HALF BELOW
|
||
+<UFE2C> ""
|
||
+% COMBINING CONJOINING MACRON BELOW
|
||
+<UFE2D> ""
|
||
+% PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
|
||
+<U000101FD> ""
|
||
+% COMBINING OLD PERMIC LETTER AN
|
||
+<U00010376> ""
|
||
+% COMBINING OLD PERMIC LETTER DOI
|
||
+<U00010377> ""
|
||
+% COMBINING OLD PERMIC LETTER ZATA
|
||
+<U00010378> ""
|
||
+% COMBINING OLD PERMIC LETTER NENOE
|
||
+<U00010379> ""
|
||
+% COMBINING OLD PERMIC LETTER SII
|
||
+<U0001037A> ""
|
||
+% COMBINING GREEK MUSICAL TRISEME
|
||
+<U0001D242> ""
|
||
+% COMBINING GREEK MUSICAL TETRASEME
|
||
+<U0001D243> ""
|
||
+% COMBINING GREEK MUSICAL PENTASEME
|
||
+<U0001D244> ""
|
||
|
||
% LATIN CAPITAL LETTER A WITH GRAVE
|
||
<U00C0> <U0041>
|
||
@@ -268,6 +772,8 @@ translit_start
|
||
<U00C4> <U0041>
|
||
% LATIN CAPITAL LETTER A WITH RING ABOVE
|
||
<U00C5> <U0041>
|
||
+% LATIN CAPITAL LETTER AE
|
||
+<U00C6> "<U0041><U0045>"
|
||
% LATIN CAPITAL LETTER C WITH CEDILLA
|
||
<U00C7> <U0043>
|
||
% LATIN CAPITAL LETTER E WITH GRAVE
|
||
@@ -298,6 +804,8 @@ translit_start
|
||
<U00D5> <U004F>
|
||
% LATIN CAPITAL LETTER O WITH DIAERESIS
|
||
<U00D6> <U004F>
|
||
+% LATIN CAPITAL LETTER O WITH STROKE
|
||
+<U00D8> <U004F>
|
||
% LATIN CAPITAL LETTER U WITH GRAVE
|
||
<U00D9> <U0055>
|
||
% LATIN CAPITAL LETTER U WITH ACUTE
|
||
@@ -320,6 +828,8 @@ translit_start
|
||
<U00E4> <U0061>
|
||
% LATIN SMALL LETTER A WITH RING ABOVE
|
||
<U00E5> <U0061>
|
||
+% LATIN SMALL LETTER AE
|
||
+<U00E6> "<U0061><U0065>"
|
||
% LATIN SMALL LETTER C WITH CEDILLA
|
||
<U00E7> <U0063>
|
||
% LATIN SMALL LETTER E WITH GRAVE
|
||
@@ -350,6 +860,8 @@ translit_start
|
||
<U00F5> <U006F>
|
||
% LATIN SMALL LETTER O WITH DIAERESIS
|
||
<U00F6> <U006F>
|
||
+% LATIN SMALL LETTER O WITH STROKE
|
||
+<U00F8> <U006F>
|
||
% LATIN SMALL LETTER U WITH GRAVE
|
||
<U00F9> <U0075>
|
||
% LATIN SMALL LETTER U WITH ACUTE
|
||
@@ -472,10 +984,6 @@ translit_start
|
||
<U013D> <U004C>
|
||
% LATIN SMALL LETTER L WITH CARON
|
||
<U013E> <U006C>
|
||
-% LATIN CAPITAL LETTER L WITH STROKE
|
||
-<U0141> <U004C>
|
||
-% LATIN SMALL LETTER L WITH STROKE
|
||
-<U0142> <U006C>
|
||
% LATIN CAPITAL LETTER N WITH ACUTE
|
||
<U0143> <U004E>
|
||
% LATIN SMALL LETTER N WITH ACUTE
|
||
@@ -673,9 +1181,9 @@ translit_start
|
||
% LATIN SMALL LETTER AE WITH ACUTE
|
||
<U01FD> <U00E6>;"<U0061><U0065>"
|
||
% LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
|
||
-<U01FE> <U004F>
|
||
+<U01FE> <U00D8>;<U004F>
|
||
% LATIN SMALL LETTER O WITH STROKE AND ACUTE
|
||
-<U01FF> <U006F>
|
||
+<U01FF> <U00F8>;<U006F>
|
||
% LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
|
||
<U0200> <U0041>
|
||
% LATIN SMALL LETTER A WITH DOUBLE GRAVE
|
||
@@ -764,14 +1272,6 @@ translit_start
|
||
<U0232> <U0059>
|
||
% LATIN SMALL LETTER Y WITH MACRON
|
||
<U0233> <U0079>
|
||
-% COMBINING GRAVE TONE MARK
|
||
-<U0340> <U0300>
|
||
-% COMBINING ACUTE TONE MARK
|
||
-<U0341> <U0301>
|
||
-% COMBINING GREEK KORONIS
|
||
-<U0343> <U0313>
|
||
-% COMBINING GREEK DIALYTIKA TONOS
|
||
-<U0344> <U0308>
|
||
% GREEK NUMERAL SIGN
|
||
<U0374> <U02B9>
|
||
% GREEK QUESTION MARK
|
||
@@ -928,6 +1428,8 @@ translit_start
|
||
<U04F8> <U042B>
|
||
% CYRILLIC SMALL LETTER YERU WITH DIAERESIS
|
||
<U04F9> <U044B>
|
||
+% HEBREW LIGATURE YIDDISH DOUBLE YOD
|
||
+<U05F2> "<U05D9><U05D9>"
|
||
% ARABIC LETTER ALEF WITH MADDA ABOVE
|
||
<U0622> <U0627>
|
||
% ARABIC LETTER ALEF WITH HAMZA ABOVE
|
||
@@ -1017,7 +1519,7 @@ translit_start
|
||
% KANNADA VOWEL SIGN O
|
||
<U0CCA> "<U0CC6><U0CC2>"
|
||
% KANNADA VOWEL SIGN OO
|
||
-<U0CCB> "<U0CCA><U0CD5>"
|
||
+<U0CCB> "<U0CC6><U0CC2><U0CD5>"
|
||
% MALAYALAM VOWEL SIGN O
|
||
<U0D4A> "<U0D46><U0D3E>"
|
||
% MALAYALAM VOWEL SIGN OO
|
||
@@ -1029,7 +1531,7 @@ translit_start
|
||
% SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA
|
||
<U0DDC> "<U0DD9><U0DCF>"
|
||
% SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA
|
||
-<U0DDD> "<U0DDC><U0DCA>"
|
||
+<U0DDD> "<U0DD9><U0DCF><U0DCA>"
|
||
% SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA
|
||
<U0DDE> "<U0DD9><U0DDF>"
|
||
% TIBETAN LETTER GHA
|
||
@@ -2020,16 +2522,114 @@ translit_start
|
||
<U2000> <U2002>;<U0020>
|
||
% EM QUAD
|
||
<U2001> <U2003>;<U0020>
|
||
+% EN SPACE
|
||
+<U2002> <U0020>
|
||
+% EM SPACE
|
||
+<U2003> <U0020>
|
||
% OHM SIGN
|
||
<U2126> <U03A9>
|
||
% KELVIN SIGN
|
||
<U212A> <U004B>
|
||
% ANGSTROM SIGN
|
||
-<U212B> <U00C5>
|
||
+<U212B> <U0041>
|
||
+% LEFTWARDS ARROW WITH STROKE
|
||
+<U219A> <U2190>
|
||
+% RIGHTWARDS ARROW WITH STROKE
|
||
+<U219B> <U2192>
|
||
+% LEFT RIGHT ARROW WITH STROKE
|
||
+<U21AE> "<U0021><U003C><U002D><U003E>"
|
||
+% LEFTWARDS DOUBLE ARROW WITH STROKE
|
||
+<U21CD> "<U0021><U003C><U003D>"
|
||
+% LEFT RIGHT DOUBLE ARROW WITH STROKE
|
||
+<U21CE> "<U0021><U003C><U003D><U003E>"
|
||
+% RIGHTWARDS DOUBLE ARROW WITH STROKE
|
||
+<U21CF> "<U0021><U003D><U003E>"
|
||
+% THERE DOES NOT EXIST
|
||
+<U2204> "<U0021><U2203>"
|
||
+% NOT AN ELEMENT OF
|
||
+<U2209> "<U0021><U2208>"
|
||
+% DOES NOT CONTAIN AS MEMBER
|
||
+<U220C> "<U0021><U220B>"
|
||
+% DOES NOT DIVIDE
|
||
+<U2224> "<U0021><U2223>"
|
||
+% NOT PARALLEL TO
|
||
+<U2226> "<U0021><U2225>"
|
||
+% NOT TILDE
|
||
+<U2241> "<U0021><U007E>"
|
||
+% NOT ASYMPTOTICALLY EQUAL TO
|
||
+<U2244> "<U0021><U007E><U002D>"
|
||
+% NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
|
||
+<U2247> "<U0021><U007E><U003D>"
|
||
+% NOT ALMOST EQUAL TO
|
||
+<U2249> "<U0021><U007E><U007E>"
|
||
+% NOT EQUAL TO
|
||
+<U2260> "<U0021><U003D>"
|
||
+% NOT IDENTICAL TO
|
||
+<U2262> "<U0021><U003D><U003D>"
|
||
+% NOT EQUIVALENT TO
|
||
+<U226D> "<U0021><U224D>"
|
||
+% NOT LESS-THAN
|
||
+<U226E> "<U0021><U003C>"
|
||
+% NOT GREATER-THAN
|
||
+<U226F> "<U0021><U003E>"
|
||
+% NEITHER LESS-THAN NOR EQUAL TO
|
||
+<U2270> "<U0021><U003C><U003D>"
|
||
+% NEITHER GREATER-THAN NOR EQUAL TO
|
||
+<U2271> "<U0021><U003E><U003D>"
|
||
+% NEITHER LESS-THAN NOR EQUIVALENT TO
|
||
+<U2274> "<U0021><U003C><U007E>"
|
||
+% NEITHER GREATER-THAN NOR EQUIVALENT TO
|
||
+<U2275> "<U0021><U003E><U007E>"
|
||
+% NEITHER LESS-THAN NOR GREATER-THAN
|
||
+<U2278> "<U0021><U003C><U003E>"
|
||
+% NEITHER GREATER-THAN NOR LESS-THAN
|
||
+<U2279> "<U0021><U003E><U003C>"
|
||
+% DOES NOT PRECEDE
|
||
+<U2280> "<U0021><U227A>"
|
||
+% DOES NOT SUCCEED
|
||
+<U2281> "<U0021><U227B>"
|
||
+% NOT A SUBSET OF
|
||
+<U2284> "<U0021><U2282>"
|
||
+% NOT A SUPERSET OF
|
||
+<U2285> "<U0021><U2283>"
|
||
+% NEITHER A SUBSET OF NOR EQUAL TO
|
||
+<U2288> "<U0021><U2282><U003D>"
|
||
+% NEITHER A SUPERSET OF NOR EQUAL TO
|
||
+<U2289> "<U0021><U2283><U003D>"
|
||
+% DOES NOT PROVE
|
||
+<U22AC> "<U0021><U22A2>"
|
||
+% NOT TRUE
|
||
+<U22AD> "<U0021><U22A8>"
|
||
+% DOES NOT FORCE
|
||
+<U22AE> "<U0021><U22A9>"
|
||
+% NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
|
||
+<U22AF> "<U0021><U22AB>"
|
||
+% DOES NOT PRECEDE OR EQUAL
|
||
+<U22E0> "<U0021><U227C>"
|
||
+% DOES NOT SUCCEED OR EQUAL
|
||
+<U22E1> "<U0021><U227D>"
|
||
+% NOT SQUARE IMAGE OF OR EQUAL TO
|
||
+<U22E2> "<U0021><U2291>"
|
||
+% NOT SQUARE ORIGINAL OF OR EQUAL TO
|
||
+<U22E3> "<U0021><U2292>"
|
||
+% NOT NORMAL SUBGROUP OF
|
||
+<U22EA> "<U0021><U22B2>"
|
||
+% DOES NOT CONTAIN AS NORMAL SUBGROUP
|
||
+<U22EB> "<U0021><U22B3>"
|
||
+% NOT NORMAL SUBGROUP OF OR EQUAL TO
|
||
+<U22EC> "<U0021><U22B4>"
|
||
+% DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
|
||
+<U22ED> "<U0021><U22B5>"
|
||
% LEFT-POINTING ANGLE BRACKET
|
||
<U2329> <U3008>;<U003C>
|
||
% RIGHT-POINTING ANGLE BRACKET
|
||
<U232A> <U3009>;<U003E>
|
||
+% FORKING
|
||
+<U2ADC> "<U0021><U2ADD>"
|
||
+% LEFT ANGLE BRACKET
|
||
+<U3008> <U003C>
|
||
+% RIGHT ANGLE BRACKET
|
||
+<U3009> <U003E>
|
||
% HIRAGANA LETTER GA
|
||
<U304C> <U304B>
|
||
% HIRAGANA LETTER GI
|
||
diff --git a/localedata/locales/translit_compat b/localedata/locales/translit_compat
|
||
index bb9d660..6e45220 100644
|
||
--- a/localedata/locales/translit_compat
|
||
+++ b/localedata/locales/translit_compat
|
||
@@ -2,18 +2,24 @@ escape_char /
|
||
comment_char %
|
||
|
||
% Transliterations of compatibility characters and ligatures.
|
||
-% Generated through
|
||
-% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<compat>[^;]*;' UnicodeData.txt | \
|
||
-% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<compat> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' | grep -v '0020 03[0-6][0-9A-F]' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G'
|
||
-% and
|
||
-% $ grep '[^;]*;[^;]*LIGATURE[^;]*;' UnicodeData.txt
|
||
+% Generated automatically from UnicodeData.txt by gen_translit_compat.py on 2015-06-10 for Unicode 7.0.0.
|
||
|
||
LC_CTYPE
|
||
|
||
translit_start
|
||
|
||
+% FEMININE ORDINAL INDICATOR
|
||
+<U00AA> "<U0061>"
|
||
+% SUPERSCRIPT TWO
|
||
+<U00B2> "<U0032>"
|
||
+% SUPERSCRIPT THREE
|
||
+<U00B3> "<U0033>"
|
||
% MICRO SIGN
|
||
-<U00B5> "<U03BC>";<U0075>
|
||
+<U00B5> "<U03BC>";"<U0075>"
|
||
+% SUPERSCRIPT ONE
|
||
+<U00B9> "<U0031>"
|
||
+% MASCULINE ORDINAL INDICATOR
|
||
+<U00BA> "<U006F>"
|
||
% LATIN CAPITAL LIGATURE IJ
|
||
<U0132> "<U0049><U004A>"
|
||
% LATIN SMALL LIGATURE IJ
|
||
@@ -54,6 +60,38 @@ translit_start
|
||
<U01F2> "<U0044><U007A>"
|
||
% LATIN SMALL LETTER DZ
|
||
<U01F3> "<U0064><U007A>"
|
||
+% MODIFIER LETTER SMALL H
|
||
+<U02B0> "<U0068>"
|
||
+% MODIFIER LETTER SMALL H WITH HOOK
|
||
+<U02B1> "<U0266>"
|
||
+% MODIFIER LETTER SMALL J
|
||
+<U02B2> "<U006A>"
|
||
+% MODIFIER LETTER SMALL R
|
||
+<U02B3> "<U0072>"
|
||
+% MODIFIER LETTER SMALL TURNED R
|
||
+<U02B4> "<U0279>"
|
||
+% MODIFIER LETTER SMALL TURNED R WITH HOOK
|
||
+<U02B5> "<U027B>"
|
||
+% MODIFIER LETTER SMALL CAPITAL INVERTED R
|
||
+<U02B6> "<U0281>"
|
||
+% MODIFIER LETTER SMALL W
|
||
+<U02B7> "<U0077>"
|
||
+% MODIFIER LETTER SMALL Y
|
||
+<U02B8> "<U0079>"
|
||
+% MODIFIER LETTER APOSTROPHE
|
||
+<U02BC> "<U0027>"
|
||
+% MODIFIER LETTER SMALL GAMMA
|
||
+<U02E0> "<U0263>"
|
||
+% MODIFIER LETTER SMALL L
|
||
+<U02E1> "<U006C>"
|
||
+% MODIFIER LETTER SMALL S
|
||
+<U02E2> "<U0073>"
|
||
+% MODIFIER LETTER SMALL X
|
||
+<U02E3> "<U0078>"
|
||
+% MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
|
||
+<U02E4> "<U0295>"
|
||
+% GREEK SMALL LETTER MU
|
||
+<U03BC> "<U0075>"
|
||
% GREEK BETA SYMBOL
|
||
<U03D0> "<U03B2>"
|
||
% GREEK THETA SYMBOL
|
||
@@ -74,6 +112,20 @@ translit_start
|
||
<U03F4> "<U0398>"
|
||
% GREEK LUNATE EPSILON SYMBOL
|
||
<U03F5> "<U03B5>"
|
||
+% GREEK CAPITAL LUNATE SIGMA SYMBOL
|
||
+<U03F9> "<U03A3>"
|
||
+% CYRILLIC CAPITAL LIGATURE EN GHE
|
||
+<U04A4> "<U041D><U0413>"
|
||
+% CYRILLIC SMALL LIGATURE EN GHE
|
||
+<U04A5> "<U043D><U0433>"
|
||
+% CYRILLIC CAPITAL LIGATURE TE TSE
|
||
+<U04B4> "<U0422><U0426>"
|
||
+% CYRILLIC SMALL LIGATURE TE TSE
|
||
+<U04B5> "<U0442><U0446>"
|
||
+% CYRILLIC CAPITAL LIGATURE A IE
|
||
+<U04D4> "<U0410><U0415>"
|
||
+% CYRILLIC SMALL LIGATURE A IE
|
||
+<U04D5> "<U0430><U0435>"
|
||
% ARMENIAN SMALL LIGATURE ECH YIWN
|
||
<U0587> "<U0565><U0582>"
|
||
% HEBREW LIGATURE YIDDISH DOUBLE VAV
|
||
@@ -102,6 +154,204 @@ translit_start
|
||
<U0F77> "<U0FB2><U0F81>"
|
||
% TIBETAN VOWEL SIGN VOCALIC LL
|
||
<U0F79> "<U0FB3><U0F81>"
|
||
+% MODIFIER LETTER GEORGIAN NAR
|
||
+<U10FC> "<U10DC>"
|
||
+% MODIFIER LETTER CAPITAL A
|
||
+<U1D2C> "<U0041>"
|
||
+% MODIFIER LETTER CAPITAL AE
|
||
+<U1D2D> "<U00C6>"
|
||
+% MODIFIER LETTER CAPITAL B
|
||
+<U1D2E> "<U0042>"
|
||
+% MODIFIER LETTER CAPITAL D
|
||
+<U1D30> "<U0044>"
|
||
+% MODIFIER LETTER CAPITAL E
|
||
+<U1D31> "<U0045>"
|
||
+% MODIFIER LETTER CAPITAL REVERSED E
|
||
+<U1D32> "<U018E>"
|
||
+% MODIFIER LETTER CAPITAL G
|
||
+<U1D33> "<U0047>"
|
||
+% MODIFIER LETTER CAPITAL H
|
||
+<U1D34> "<U0048>"
|
||
+% MODIFIER LETTER CAPITAL I
|
||
+<U1D35> "<U0049>"
|
||
+% MODIFIER LETTER CAPITAL J
|
||
+<U1D36> "<U004A>"
|
||
+% MODIFIER LETTER CAPITAL K
|
||
+<U1D37> "<U004B>"
|
||
+% MODIFIER LETTER CAPITAL L
|
||
+<U1D38> "<U004C>"
|
||
+% MODIFIER LETTER CAPITAL M
|
||
+<U1D39> "<U004D>"
|
||
+% MODIFIER LETTER CAPITAL N
|
||
+<U1D3A> "<U004E>"
|
||
+% MODIFIER LETTER CAPITAL O
|
||
+<U1D3C> "<U004F>"
|
||
+% MODIFIER LETTER CAPITAL OU
|
||
+<U1D3D> "<U0222>"
|
||
+% MODIFIER LETTER CAPITAL P
|
||
+<U1D3E> "<U0050>"
|
||
+% MODIFIER LETTER CAPITAL R
|
||
+<U1D3F> "<U0052>"
|
||
+% MODIFIER LETTER CAPITAL T
|
||
+<U1D40> "<U0054>"
|
||
+% MODIFIER LETTER CAPITAL U
|
||
+<U1D41> "<U0055>"
|
||
+% MODIFIER LETTER CAPITAL W
|
||
+<U1D42> "<U0057>"
|
||
+% MODIFIER LETTER SMALL A
|
||
+<U1D43> "<U0061>"
|
||
+% MODIFIER LETTER SMALL TURNED A
|
||
+<U1D44> "<U0250>"
|
||
+% MODIFIER LETTER SMALL ALPHA
|
||
+<U1D45> "<U0251>"
|
||
+% MODIFIER LETTER SMALL TURNED AE
|
||
+<U1D46> "<U1D02>"
|
||
+% MODIFIER LETTER SMALL B
|
||
+<U1D47> "<U0062>"
|
||
+% MODIFIER LETTER SMALL D
|
||
+<U1D48> "<U0064>"
|
||
+% MODIFIER LETTER SMALL E
|
||
+<U1D49> "<U0065>"
|
||
+% MODIFIER LETTER SMALL SCHWA
|
||
+<U1D4A> "<U0259>"
|
||
+% MODIFIER LETTER SMALL OPEN E
|
||
+<U1D4B> "<U025B>"
|
||
+% MODIFIER LETTER SMALL TURNED OPEN E
|
||
+<U1D4C> "<U025C>"
|
||
+% MODIFIER LETTER SMALL G
|
||
+<U1D4D> "<U0067>"
|
||
+% MODIFIER LETTER SMALL K
|
||
+<U1D4F> "<U006B>"
|
||
+% MODIFIER LETTER SMALL M
|
||
+<U1D50> "<U006D>"
|
||
+% MODIFIER LETTER SMALL ENG
|
||
+<U1D51> "<U014B>"
|
||
+% MODIFIER LETTER SMALL O
|
||
+<U1D52> "<U006F>"
|
||
+% MODIFIER LETTER SMALL OPEN O
|
||
+<U1D53> "<U0254>"
|
||
+% MODIFIER LETTER SMALL TOP HALF O
|
||
+<U1D54> "<U1D16>"
|
||
+% MODIFIER LETTER SMALL BOTTOM HALF O
|
||
+<U1D55> "<U1D17>"
|
||
+% MODIFIER LETTER SMALL P
|
||
+<U1D56> "<U0070>"
|
||
+% MODIFIER LETTER SMALL T
|
||
+<U1D57> "<U0074>"
|
||
+% MODIFIER LETTER SMALL U
|
||
+<U1D58> "<U0075>"
|
||
+% MODIFIER LETTER SMALL SIDEWAYS U
|
||
+<U1D59> "<U1D1D>"
|
||
+% MODIFIER LETTER SMALL TURNED M
|
||
+<U1D5A> "<U026F>"
|
||
+% MODIFIER LETTER SMALL V
|
||
+<U1D5B> "<U0076>"
|
||
+% MODIFIER LETTER SMALL AIN
|
||
+<U1D5C> "<U1D25>"
|
||
+% MODIFIER LETTER SMALL BETA
|
||
+<U1D5D> "<U03B2>"
|
||
+% MODIFIER LETTER SMALL GREEK GAMMA
|
||
+<U1D5E> "<U03B3>"
|
||
+% MODIFIER LETTER SMALL DELTA
|
||
+<U1D5F> "<U03B4>"
|
||
+% MODIFIER LETTER SMALL GREEK PHI
|
||
+<U1D60> "<U03C6>"
|
||
+% MODIFIER LETTER SMALL CHI
|
||
+<U1D61> "<U03C7>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER I
|
||
+<U1D62> "<U0069>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER R
|
||
+<U1D63> "<U0072>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER U
|
||
+<U1D64> "<U0075>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER V
|
||
+<U1D65> "<U0076>"
|
||
+% GREEK SUBSCRIPT SMALL LETTER BETA
|
||
+<U1D66> "<U03B2>"
|
||
+% GREEK SUBSCRIPT SMALL LETTER GAMMA
|
||
+<U1D67> "<U03B3>"
|
||
+% GREEK SUBSCRIPT SMALL LETTER RHO
|
||
+<U1D68> "<U03C1>"
|
||
+% GREEK SUBSCRIPT SMALL LETTER PHI
|
||
+<U1D69> "<U03C6>"
|
||
+% GREEK SUBSCRIPT SMALL LETTER CHI
|
||
+<U1D6A> "<U03C7>"
|
||
+% MODIFIER LETTER CYRILLIC EN
|
||
+<U1D78> "<U043D>"
|
||
+% MODIFIER LETTER SMALL TURNED ALPHA
|
||
+<U1D9B> "<U0252>"
|
||
+% MODIFIER LETTER SMALL C
|
||
+<U1D9C> "<U0063>"
|
||
+% MODIFIER LETTER SMALL C WITH CURL
|
||
+<U1D9D> "<U0255>"
|
||
+% MODIFIER LETTER SMALL ETH
|
||
+<U1D9E> "<U00F0>"
|
||
+% MODIFIER LETTER SMALL REVERSED OPEN E
|
||
+<U1D9F> "<U025C>"
|
||
+% MODIFIER LETTER SMALL F
|
||
+<U1DA0> "<U0066>"
|
||
+% MODIFIER LETTER SMALL DOTLESS J WITH STROKE
|
||
+<U1DA1> "<U025F>"
|
||
+% MODIFIER LETTER SMALL SCRIPT G
|
||
+<U1DA2> "<U0261>"
|
||
+% MODIFIER LETTER SMALL TURNED H
|
||
+<U1DA3> "<U0265>"
|
||
+% MODIFIER LETTER SMALL I WITH STROKE
|
||
+<U1DA4> "<U0268>"
|
||
+% MODIFIER LETTER SMALL IOTA
|
||
+<U1DA5> "<U0269>"
|
||
+% MODIFIER LETTER SMALL CAPITAL I
|
||
+<U1DA6> "<U026A>"
|
||
+% MODIFIER LETTER SMALL CAPITAL I WITH STROKE
|
||
+<U1DA7> "<U1D7B>"
|
||
+% MODIFIER LETTER SMALL J WITH CROSSED-TAIL
|
||
+<U1DA8> "<U029D>"
|
||
+% MODIFIER LETTER SMALL L WITH RETROFLEX HOOK
|
||
+<U1DA9> "<U026D>"
|
||
+% MODIFIER LETTER SMALL L WITH PALATAL HOOK
|
||
+<U1DAA> "<U1D85>"
|
||
+% MODIFIER LETTER SMALL CAPITAL L
|
||
+<U1DAB> "<U029F>"
|
||
+% MODIFIER LETTER SMALL M WITH HOOK
|
||
+<U1DAC> "<U0271>"
|
||
+% MODIFIER LETTER SMALL TURNED M WITH LONG LEG
|
||
+<U1DAD> "<U0270>"
|
||
+% MODIFIER LETTER SMALL N WITH LEFT HOOK
|
||
+<U1DAE> "<U0272>"
|
||
+% MODIFIER LETTER SMALL N WITH RETROFLEX HOOK
|
||
+<U1DAF> "<U0273>"
|
||
+% MODIFIER LETTER SMALL CAPITAL N
|
||
+<U1DB0> "<U0274>"
|
||
+% MODIFIER LETTER SMALL BARRED O
|
||
+<U1DB1> "<U0275>"
|
||
+% MODIFIER LETTER SMALL PHI
|
||
+<U1DB2> "<U0278>"
|
||
+% MODIFIER LETTER SMALL S WITH HOOK
|
||
+<U1DB3> "<U0282>"
|
||
+% MODIFIER LETTER SMALL ESH
|
||
+<U1DB4> "<U0283>"
|
||
+% MODIFIER LETTER SMALL T WITH PALATAL HOOK
|
||
+<U1DB5> "<U01AB>"
|
||
+% MODIFIER LETTER SMALL U BAR
|
||
+<U1DB6> "<U0289>"
|
||
+% MODIFIER LETTER SMALL UPSILON
|
||
+<U1DB7> "<U028A>"
|
||
+% MODIFIER LETTER SMALL CAPITAL U
|
||
+<U1DB8> "<U1D1C>"
|
||
+% MODIFIER LETTER SMALL V WITH HOOK
|
||
+<U1DB9> "<U028B>"
|
||
+% MODIFIER LETTER SMALL TURNED V
|
||
+<U1DBA> "<U028C>"
|
||
+% MODIFIER LETTER SMALL Z
|
||
+<U1DBB> "<U007A>"
|
||
+% MODIFIER LETTER SMALL Z WITH RETROFLEX HOOK
|
||
+<U1DBC> "<U0290>"
|
||
+% MODIFIER LETTER SMALL Z WITH CURL
|
||
+<U1DBD> "<U0291>"
|
||
+% MODIFIER LETTER SMALL EZH
|
||
+<U1DBE> "<U0292>"
|
||
+% MODIFIER LETTER SMALL THETA
|
||
+<U1DBF> "<U03B8>"
|
||
% LATIN SMALL LETTER A WITH RIGHT HALF RING
|
||
<U1E9A> "<U0061><U02BE>"
|
||
% EN SPACE
|
||
@@ -146,6 +396,90 @@ translit_start
|
||
<U2057> "<U2032><U2032><U2032><U2032>"
|
||
% MEDIUM MATHEMATICAL SPACE
|
||
<U205F> "<U0020>"
|
||
+% SUPERSCRIPT ZERO
|
||
+<U2070> "<U0030>"
|
||
+% SUPERSCRIPT LATIN SMALL LETTER I
|
||
+<U2071> "<U0069>"
|
||
+% SUPERSCRIPT FOUR
|
||
+<U2074> "<U0034>"
|
||
+% SUPERSCRIPT FIVE
|
||
+<U2075> "<U0035>"
|
||
+% SUPERSCRIPT SIX
|
||
+<U2076> "<U0036>"
|
||
+% SUPERSCRIPT SEVEN
|
||
+<U2077> "<U0037>"
|
||
+% SUPERSCRIPT EIGHT
|
||
+<U2078> "<U0038>"
|
||
+% SUPERSCRIPT NINE
|
||
+<U2079> "<U0039>"
|
||
+% SUPERSCRIPT PLUS SIGN
|
||
+<U207A> "<U002B>"
|
||
+% SUPERSCRIPT MINUS
|
||
+<U207B> "<U2212>"
|
||
+% SUPERSCRIPT EQUALS SIGN
|
||
+<U207C> "<U003D>"
|
||
+% SUPERSCRIPT LEFT PARENTHESIS
|
||
+<U207D> "<U0028>"
|
||
+% SUPERSCRIPT RIGHT PARENTHESIS
|
||
+<U207E> "<U0029>"
|
||
+% SUPERSCRIPT LATIN SMALL LETTER N
|
||
+<U207F> "<U006E>"
|
||
+% SUBSCRIPT ZERO
|
||
+<U2080> "<U0030>"
|
||
+% SUBSCRIPT ONE
|
||
+<U2081> "<U0031>"
|
||
+% SUBSCRIPT TWO
|
||
+<U2082> "<U0032>"
|
||
+% SUBSCRIPT THREE
|
||
+<U2083> "<U0033>"
|
||
+% SUBSCRIPT FOUR
|
||
+<U2084> "<U0034>"
|
||
+% SUBSCRIPT FIVE
|
||
+<U2085> "<U0035>"
|
||
+% SUBSCRIPT SIX
|
||
+<U2086> "<U0036>"
|
||
+% SUBSCRIPT SEVEN
|
||
+<U2087> "<U0037>"
|
||
+% SUBSCRIPT EIGHT
|
||
+<U2088> "<U0038>"
|
||
+% SUBSCRIPT NINE
|
||
+<U2089> "<U0039>"
|
||
+% SUBSCRIPT PLUS SIGN
|
||
+<U208A> "<U002B>"
|
||
+% SUBSCRIPT MINUS
|
||
+<U208B> "<U2212>"
|
||
+% SUBSCRIPT EQUALS SIGN
|
||
+<U208C> "<U003D>"
|
||
+% SUBSCRIPT LEFT PARENTHESIS
|
||
+<U208D> "<U0028>"
|
||
+% SUBSCRIPT RIGHT PARENTHESIS
|
||
+<U208E> "<U0029>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER A
|
||
+<U2090> "<U0061>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER E
|
||
+<U2091> "<U0065>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER O
|
||
+<U2092> "<U006F>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER X
|
||
+<U2093> "<U0078>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER SCHWA
|
||
+<U2094> "<U0259>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER H
|
||
+<U2095> "<U0068>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER K
|
||
+<U2096> "<U006B>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER L
|
||
+<U2097> "<U006C>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER M
|
||
+<U2098> "<U006D>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER N
|
||
+<U2099> "<U006E>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER P
|
||
+<U209A> "<U0070>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER S
|
||
+<U209B> "<U0073>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER T
|
||
+<U209C> "<U0074>"
|
||
% RUPEE SIGN
|
||
<U20A8> "<U0052><U0073>"
|
||
% ACCOUNT OF
|
||
@@ -164,8 +498,12 @@ translit_start
|
||
<U2109> "<U00B0><U0046>"
|
||
% NUMERO SIGN
|
||
<U2116> "<U004E><U006F>"
|
||
+% SERVICE MARK
|
||
+<U2120> "<U0053><U004D>"
|
||
% TELEPHONE SIGN
|
||
<U2121> "<U0054><U0045><U004C>"
|
||
+% TRADE MARK SIGN
|
||
+<U2122> "<U0054><U004D>"
|
||
% ALEF SYMBOL
|
||
<U2135> "<U05D0>"
|
||
% BET SYMBOL
|
||
@@ -174,6 +512,8 @@ translit_start
|
||
<U2137> "<U05D2>"
|
||
% DALET SYMBOL
|
||
<U2138> "<U05D3>"
|
||
+% FACSIMILE SIGN
|
||
+<U213B> "<U0046><U0041><U0058>"
|
||
% ROMAN NUMERAL ONE
|
||
<U2160> "<U0049>"
|
||
% ROMAN NUMERAL TWO
|
||
@@ -386,6 +726,12 @@ translit_start
|
||
<U2A75> "<U003D><U003D>"
|
||
% THREE CONSECUTIVE EQUALS SIGNS
|
||
<U2A76> "<U003D><U003D><U003D>"
|
||
+% LATIN SUBSCRIPT SMALL LETTER J
|
||
+<U2C7C> "<U006A>"
|
||
+% MODIFIER LETTER CAPITAL V
|
||
+<U2C7D> "<U0056>"
|
||
+% TIFINAGH MODIFIER LETTER LABIALIZATION MARK
|
||
+<U2D6F> "<U2D61>"
|
||
% CJK RADICAL MOTHER
|
||
<U2E9F> "<U6BCD>"
|
||
% CJK RADICAL C-SIMPLIFIED TURTLE
|
||
@@ -830,6 +1176,10 @@ translit_start
|
||
<U309B> "<U0020><U3099>"
|
||
% KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||
<U309C> "<U0020><U309A>"
|
||
+% HIRAGANA DIGRAPH YORI
|
||
+<U309F> "<U3088><U308A>"
|
||
+% KATAKANA DIGRAPH KOTO
|
||
+<U30FF> "<U30B3><U30C8>"
|
||
% HANGUL LETTER KIYEOK
|
||
<U3131> "<U1100>"
|
||
% HANGUL LETTER SSANGKIYEOK
|
||
@@ -1018,6 +1368,34 @@ translit_start
|
||
<U318D> "<U119E>"
|
||
% HANGUL LETTER ARAEAE
|
||
<U318E> "<U11A1>"
|
||
+% IDEOGRAPHIC ANNOTATION ONE MARK
|
||
+<U3192> "<U4E00>"
|
||
+% IDEOGRAPHIC ANNOTATION TWO MARK
|
||
+<U3193> "<U4E8C>"
|
||
+% IDEOGRAPHIC ANNOTATION THREE MARK
|
||
+<U3194> "<U4E09>"
|
||
+% IDEOGRAPHIC ANNOTATION FOUR MARK
|
||
+<U3195> "<U56DB>"
|
||
+% IDEOGRAPHIC ANNOTATION TOP MARK
|
||
+<U3196> "<U4E0A>"
|
||
+% IDEOGRAPHIC ANNOTATION MIDDLE MARK
|
||
+<U3197> "<U4E2D>"
|
||
+% IDEOGRAPHIC ANNOTATION BOTTOM MARK
|
||
+<U3198> "<U4E0B>"
|
||
+% IDEOGRAPHIC ANNOTATION FIRST MARK
|
||
+<U3199> "<U7532>"
|
||
+% IDEOGRAPHIC ANNOTATION SECOND MARK
|
||
+<U319A> "<U4E59>"
|
||
+% IDEOGRAPHIC ANNOTATION THIRD MARK
|
||
+<U319B> "<U4E19>"
|
||
+% IDEOGRAPHIC ANNOTATION FOURTH MARK
|
||
+<U319C> "<U4E01>"
|
||
+% IDEOGRAPHIC ANNOTATION HEAVEN MARK
|
||
+<U319D> "<U5929>"
|
||
+% IDEOGRAPHIC ANNOTATION EARTH MARK
|
||
+<U319E> "<U5730>"
|
||
+% IDEOGRAPHIC ANNOTATION MAN MARK
|
||
+<U319F> "<U4EBA>"
|
||
% PARENTHESIZED HANGUL KIYEOK
|
||
<U3200> "<U0028><U1100><U0029>"
|
||
% PARENTHESIZED HANGUL NIEUN
|
||
@@ -1076,6 +1454,10 @@ translit_start
|
||
<U321B> "<U0028><U1112><U1161><U0029>"
|
||
% PARENTHESIZED HANGUL CIEUC U
|
||
<U321C> "<U0028><U110C><U116E><U0029>"
|
||
+% PARENTHESIZED KOREAN CHARACTER OJEON
|
||
+<U321D> "<U0028><U110B><U1169><U110C><U1165><U11AB><U0029>"
|
||
+% PARENTHESIZED KOREAN CHARACTER O HU
|
||
+<U321E> "<U0028><U110B><U1169><U1112><U116E><U0029>"
|
||
% PARENTHESIZED IDEOGRAPH ONE
|
||
<U3220> "<U0028><U4E00><U0029>"
|
||
% PARENTHESIZED IDEOGRAPH TWO
|
||
@@ -1284,6 +1666,24 @@ translit_start
|
||
<U33FD> "<U0033><U0030><U65E5>"
|
||
% IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
|
||
<U33FE> "<U0033><U0031><U65E5>"
|
||
+% MODIFIER LETTER CYRILLIC HARD SIGN
|
||
+<UA69C> "<U044A>"
|
||
+% MODIFIER LETTER CYRILLIC SOFT SIGN
|
||
+<UA69D> "<U044C>"
|
||
+% MODIFIER LETTER US
|
||
+<UA770> "<UA76F>"
|
||
+% MODIFIER LETTER CAPITAL H WITH STROKE
|
||
+<UA7F8> "<U0126>"
|
||
+% MODIFIER LETTER SMALL LIGATURE OE
|
||
+<UA7F9> "<U0153>"
|
||
+% MODIFIER LETTER SMALL HENG
|
||
+<UAB5C> "<UA727>"
|
||
+% MODIFIER LETTER SMALL L WITH INVERTED LAZY S
|
||
+<UAB5D> "<UAB37>"
|
||
+% MODIFIER LETTER SMALL L WITH MIDDLE TILDE
|
||
+<UAB5E> "<U026B>"
|
||
+% MODIFIER LETTER SMALL U WITH LEFT HOOK
|
||
+<UAB5F> "<UAB52>"
|
||
% LATIN SMALL LIGATURE FF
|
||
<UFB00> "<U0066><U0066>"
|
||
% LATIN SMALL LIGATURE FI
|
||
@@ -1295,7 +1695,7 @@ translit_start
|
||
% LATIN SMALL LIGATURE FFL
|
||
<UFB04> "<U0066><U0066><U006C>"
|
||
% LATIN SMALL LIGATURE LONG S T
|
||
-<UFB05> "<U017F><U0074>"
|
||
+<UFB05> "<U0073><U0074>"
|
||
% LATIN SMALL LIGATURE ST
|
||
<UFB06> "<U0073><U0074>"
|
||
% ARMENIAN SMALL LIGATURE MEN NOW
|
||
@@ -1310,6 +1710,72 @@ translit_start
|
||
<UFB17> "<U0574><U056D>"
|
||
% HEBREW LIGATURE ALEF LAMED
|
||
<UFB4F> "<U05D0><U05DC>"
|
||
+% PRESENTATION FORM FOR VERTICAL COMMA
|
||
+<UFE10> "<U002C>"
|
||
+% PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA
|
||
+<UFE11> "<U3001>"
|
||
+% PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP
|
||
+<UFE12> "<U3002>"
|
||
+% PRESENTATION FORM FOR VERTICAL COLON
|
||
+<UFE13> "<U003A>"
|
||
+% PRESENTATION FORM FOR VERTICAL SEMICOLON
|
||
+<UFE14> "<U003B>"
|
||
+% PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK
|
||
+<UFE15> "<U0021>"
|
||
+% PRESENTATION FORM FOR VERTICAL QUESTION MARK
|
||
+<UFE16> "<U003F>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
|
||
+<UFE17> "<U3016>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
|
||
+<UFE18> "<U3017>"
|
||
+% PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
|
||
+<UFE19> "<U002E><U002E><U002E>"
|
||
+% PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
|
||
+<UFE30> "<U002E><U002E>"
|
||
+% PRESENTATION FORM FOR VERTICAL EM DASH
|
||
+<UFE31> "<U2014>"
|
||
+% PRESENTATION FORM FOR VERTICAL EN DASH
|
||
+<UFE32> "<U2013>"
|
||
+% PRESENTATION FORM FOR VERTICAL LOW LINE
|
||
+<UFE33> "<U005F>"
|
||
+% PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
|
||
+<UFE34> "<U005F>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
|
||
+<UFE35> "<U0028>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
|
||
+<UFE36> "<U0029>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
|
||
+<UFE37> "<U007B>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
|
||
+<UFE38> "<U007D>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
|
||
+<UFE39> "<U3014>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
|
||
+<UFE3A> "<U3015>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
|
||
+<UFE3B> "<U3010>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
|
||
+<UFE3C> "<U3011>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
|
||
+<UFE3D> "<U300A>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
|
||
+<UFE3E> "<U300B>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
|
||
+<UFE3F> "<U3008>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
|
||
+<UFE40> "<U3009>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
|
||
+<UFE41> "<U300C>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
|
||
+<UFE42> "<U300D>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
|
||
+<UFE43> "<U300E>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
|
||
+<UFE44> "<U300F>"
|
||
+% PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET
|
||
+<UFE47> "<U005B>"
|
||
+% PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET
|
||
+<UFE48> "<U005D>"
|
||
% DASHED OVERLINE
|
||
<UFE49> "<U203E>"
|
||
% CENTRELINE OVERLINE
|
||
@@ -1324,6 +1790,104 @@ translit_start
|
||
<UFE4E> "<U005F>"
|
||
% WAVY LOW LINE
|
||
<UFE4F> "<U005F>"
|
||
+% DIGIT ZERO FULL STOP
|
||
+<U0001F100> "<U0030><U002E>"
|
||
+% DIGIT ZERO COMMA
|
||
+<U0001F101> "<U0030><U002C>"
|
||
+% DIGIT ONE COMMA
|
||
+<U0001F102> "<U0031><U002C>"
|
||
+% DIGIT TWO COMMA
|
||
+<U0001F103> "<U0032><U002C>"
|
||
+% DIGIT THREE COMMA
|
||
+<U0001F104> "<U0033><U002C>"
|
||
+% DIGIT FOUR COMMA
|
||
+<U0001F105> "<U0034><U002C>"
|
||
+% DIGIT FIVE COMMA
|
||
+<U0001F106> "<U0035><U002C>"
|
||
+% DIGIT SIX COMMA
|
||
+<U0001F107> "<U0036><U002C>"
|
||
+% DIGIT SEVEN COMMA
|
||
+<U0001F108> "<U0037><U002C>"
|
||
+% DIGIT EIGHT COMMA
|
||
+<U0001F109> "<U0038><U002C>"
|
||
+% DIGIT NINE COMMA
|
||
+<U0001F10A> "<U0039><U002C>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER A
|
||
+<U0001F110> "<U0028><U0041><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER B
|
||
+<U0001F111> "<U0028><U0042><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER C
|
||
+<U0001F112> "<U0028><U0043><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER D
|
||
+<U0001F113> "<U0028><U0044><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER E
|
||
+<U0001F114> "<U0028><U0045><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER F
|
||
+<U0001F115> "<U0028><U0046><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER G
|
||
+<U0001F116> "<U0028><U0047><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER H
|
||
+<U0001F117> "<U0028><U0048><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER I
|
||
+<U0001F118> "<U0028><U0049><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER J
|
||
+<U0001F119> "<U0028><U004A><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER K
|
||
+<U0001F11A> "<U0028><U004B><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER L
|
||
+<U0001F11B> "<U0028><U004C><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER M
|
||
+<U0001F11C> "<U0028><U004D><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER N
|
||
+<U0001F11D> "<U0028><U004E><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER O
|
||
+<U0001F11E> "<U0028><U004F><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER P
|
||
+<U0001F11F> "<U0028><U0050><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER Q
|
||
+<U0001F120> "<U0028><U0051><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER R
|
||
+<U0001F121> "<U0028><U0052><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER S
|
||
+<U0001F122> "<U0028><U0053><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER T
|
||
+<U0001F123> "<U0028><U0054><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER U
|
||
+<U0001F124> "<U0028><U0055><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER V
|
||
+<U0001F125> "<U0028><U0056><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER W
|
||
+<U0001F126> "<U0028><U0057><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER X
|
||
+<U0001F127> "<U0028><U0058><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER Y
|
||
+<U0001F128> "<U0028><U0059><U0029>"
|
||
+% PARENTHESIZED LATIN CAPITAL LETTER Z
|
||
+<U0001F129> "<U0028><U005A><U0029>"
|
||
+% TORTOISE SHELL BRACKETED LATIN CAPITAL LETTER S
|
||
+<U0001F12A> "<U3014><U0053><U3015>"
|
||
+% RAISED MC SIGN
|
||
+<U0001F16A> "<U004D><U0043>"
|
||
+% RAISED MD SIGN
|
||
+<U0001F16B> "<U004D><U0044>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C
|
||
+<U0001F240> "<U3014><U672C><U3015>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E09
|
||
+<U0001F241> "<U3014><U4E09><U3015>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E8C
|
||
+<U0001F242> "<U3014><U4E8C><U3015>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-5B89
|
||
+<U0001F243> "<U3014><U5B89><U3015>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-70B9
|
||
+<U0001F244> "<U3014><U70B9><U3015>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6253
|
||
+<U0001F245> "<U3014><U6253><U3015>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-76D7
|
||
+<U0001F246> "<U3014><U76D7><U3015>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-52DD
|
||
+<U0001F247> "<U3014><U52DD><U3015>"
|
||
+% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
|
||
+<U0001F248> "<U3014><U6557><U3015>"
|
||
|
||
translit_end
|
||
|
||
diff --git a/localedata/locales/translit_font b/localedata/locales/translit_font
|
||
index 9347bd4..65e0d90 100644
|
||
--- a/localedata/locales/translit_font
|
||
+++ b/localedata/locales/translit_font
|
||
@@ -2,9 +2,7 @@ escape_char /
|
||
comment_char %
|
||
|
||
% Transliterations of font equivalents.
|
||
-% Generated through
|
||
-% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<font>[^;]*;' UnicodeData.txt | \
|
||
-% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<font> \([^;]*\);.*$/<U\1> <U\3> % \2/'
|
||
+% Generated automatically from UnicodeData.txt by gen_translit_font.py on 2015-06-10 for Unicode 7.0.0.
|
||
|
||
LC_CTYPE
|
||
|
||
@@ -37,6 +35,7 @@ translit_start
|
||
<U2133> <U004D> % SCRIPT CAPITAL M
|
||
<U2134> <U006F> % SCRIPT SMALL O
|
||
<U2139> <U0069> % INFORMATION SOURCE
|
||
+<U213C> <U03C0> % DOUBLE-STRUCK SMALL PI
|
||
<U213D> <U03B3> % DOUBLE-STRUCK SMALL GAMMA
|
||
<U213E> <U0393> % DOUBLE-STRUCK CAPITAL GAMMA
|
||
<U213F> <U03A0> % DOUBLE-STRUCK CAPITAL PI
|
||
@@ -238,6 +237,7 @@ translit_start
|
||
<U0001D4BE> <U0069> % MATHEMATICAL SCRIPT SMALL I
|
||
<U0001D4BF> <U006A> % MATHEMATICAL SCRIPT SMALL J
|
||
<U0001D4C0> <U006B> % MATHEMATICAL SCRIPT SMALL K
|
||
+<U0001D4C1> <U006C> % MATHEMATICAL SCRIPT SMALL L
|
||
<U0001D4C2> <U006D> % MATHEMATICAL SCRIPT SMALL M
|
||
<U0001D4C3> <U006E> % MATHEMATICAL SCRIPT SMALL N
|
||
<U0001D4C5> <U0070> % MATHEMATICAL SCRIPT SMALL P
|
||
@@ -707,6 +707,8 @@ translit_start
|
||
<U0001D6A1> <U0078> % MATHEMATICAL MONOSPACE SMALL X
|
||
<U0001D6A2> <U0079> % MATHEMATICAL MONOSPACE SMALL Y
|
||
<U0001D6A3> <U007A> % MATHEMATICAL MONOSPACE SMALL Z
|
||
+<U0001D6A4> <U0131> % MATHEMATICAL ITALIC SMALL DOTLESS I
|
||
+<U0001D6A5> <U0237> % MATHEMATICAL ITALIC SMALL DOTLESS J
|
||
<U0001D6A8> <U0391> % MATHEMATICAL BOLD CAPITAL ALPHA
|
||
<U0001D6A9> <U0392> % MATHEMATICAL BOLD CAPITAL BETA
|
||
<U0001D6AA> <U0393> % MATHEMATICAL BOLD CAPITAL GAMMA
|
||
@@ -997,6 +999,8 @@ translit_start
|
||
<U0001D7C7> <U03D5> % MATHEMATICAL SANS-SERIF BOLD ITALIC PHI SYMBOL
|
||
<U0001D7C8> <U03F1> % MATHEMATICAL SANS-SERIF BOLD ITALIC RHO SYMBOL
|
||
<U0001D7C9> <U03D6> % MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL
|
||
+<U0001D7CA> <U03DC> % MATHEMATICAL BOLD CAPITAL DIGAMMA
|
||
+<U0001D7CB> <U03DD> % MATHEMATICAL BOLD SMALL DIGAMMA
|
||
<U0001D7CE> <U0030> % MATHEMATICAL BOLD DIGIT ZERO
|
||
<U0001D7CF> <U0031> % MATHEMATICAL BOLD DIGIT ONE
|
||
<U0001D7D0> <U0032> % MATHEMATICAL BOLD DIGIT TWO
|
||
@@ -1047,6 +1051,147 @@ translit_start
|
||
<U0001D7FD> <U0037> % MATHEMATICAL MONOSPACE DIGIT SEVEN
|
||
<U0001D7FE> <U0038> % MATHEMATICAL MONOSPACE DIGIT EIGHT
|
||
<U0001D7FF> <U0039> % MATHEMATICAL MONOSPACE DIGIT NINE
|
||
+<U0001EE00> <U0627> % ARABIC MATHEMATICAL ALEF
|
||
+<U0001EE01> <U0628> % ARABIC MATHEMATICAL BEH
|
||
+<U0001EE02> <U062C> % ARABIC MATHEMATICAL JEEM
|
||
+<U0001EE03> <U062F> % ARABIC MATHEMATICAL DAL
|
||
+<U0001EE05> <U0648> % ARABIC MATHEMATICAL WAW
|
||
+<U0001EE06> <U0632> % ARABIC MATHEMATICAL ZAIN
|
||
+<U0001EE07> <U062D> % ARABIC MATHEMATICAL HAH
|
||
+<U0001EE08> <U0637> % ARABIC MATHEMATICAL TAH
|
||
+<U0001EE09> <U064A> % ARABIC MATHEMATICAL YEH
|
||
+<U0001EE0A> <U0643> % ARABIC MATHEMATICAL KAF
|
||
+<U0001EE0B> <U0644> % ARABIC MATHEMATICAL LAM
|
||
+<U0001EE0C> <U0645> % ARABIC MATHEMATICAL MEEM
|
||
+<U0001EE0D> <U0646> % ARABIC MATHEMATICAL NOON
|
||
+<U0001EE0E> <U0633> % ARABIC MATHEMATICAL SEEN
|
||
+<U0001EE0F> <U0639> % ARABIC MATHEMATICAL AIN
|
||
+<U0001EE10> <U0641> % ARABIC MATHEMATICAL FEH
|
||
+<U0001EE11> <U0635> % ARABIC MATHEMATICAL SAD
|
||
+<U0001EE12> <U0642> % ARABIC MATHEMATICAL QAF
|
||
+<U0001EE13> <U0631> % ARABIC MATHEMATICAL REH
|
||
+<U0001EE14> <U0634> % ARABIC MATHEMATICAL SHEEN
|
||
+<U0001EE15> <U062A> % ARABIC MATHEMATICAL TEH
|
||
+<U0001EE16> <U062B> % ARABIC MATHEMATICAL THEH
|
||
+<U0001EE17> <U062E> % ARABIC MATHEMATICAL KHAH
|
||
+<U0001EE18> <U0630> % ARABIC MATHEMATICAL THAL
|
||
+<U0001EE19> <U0636> % ARABIC MATHEMATICAL DAD
|
||
+<U0001EE1A> <U0638> % ARABIC MATHEMATICAL ZAH
|
||
+<U0001EE1B> <U063A> % ARABIC MATHEMATICAL GHAIN
|
||
+<U0001EE1C> <U066E> % ARABIC MATHEMATICAL DOTLESS BEH
|
||
+<U0001EE1D> <U06BA> % ARABIC MATHEMATICAL DOTLESS NOON
|
||
+<U0001EE1E> <U06A1> % ARABIC MATHEMATICAL DOTLESS FEH
|
||
+<U0001EE1F> <U066F> % ARABIC MATHEMATICAL DOTLESS QAF
|
||
+<U0001EE21> <U0628> % ARABIC MATHEMATICAL INITIAL BEH
|
||
+<U0001EE22> <U062C> % ARABIC MATHEMATICAL INITIAL JEEM
|
||
+<U0001EE24> <U0647> % ARABIC MATHEMATICAL INITIAL HEH
|
||
+<U0001EE27> <U062D> % ARABIC MATHEMATICAL INITIAL HAH
|
||
+<U0001EE29> <U064A> % ARABIC MATHEMATICAL INITIAL YEH
|
||
+<U0001EE2A> <U0643> % ARABIC MATHEMATICAL INITIAL KAF
|
||
+<U0001EE2B> <U0644> % ARABIC MATHEMATICAL INITIAL LAM
|
||
+<U0001EE2C> <U0645> % ARABIC MATHEMATICAL INITIAL MEEM
|
||
+<U0001EE2D> <U0646> % ARABIC MATHEMATICAL INITIAL NOON
|
||
+<U0001EE2E> <U0633> % ARABIC MATHEMATICAL INITIAL SEEN
|
||
+<U0001EE2F> <U0639> % ARABIC MATHEMATICAL INITIAL AIN
|
||
+<U0001EE30> <U0641> % ARABIC MATHEMATICAL INITIAL FEH
|
||
+<U0001EE31> <U0635> % ARABIC MATHEMATICAL INITIAL SAD
|
||
+<U0001EE32> <U0642> % ARABIC MATHEMATICAL INITIAL QAF
|
||
+<U0001EE34> <U0634> % ARABIC MATHEMATICAL INITIAL SHEEN
|
||
+<U0001EE35> <U062A> % ARABIC MATHEMATICAL INITIAL TEH
|
||
+<U0001EE36> <U062B> % ARABIC MATHEMATICAL INITIAL THEH
|
||
+<U0001EE37> <U062E> % ARABIC MATHEMATICAL INITIAL KHAH
|
||
+<U0001EE39> <U0636> % ARABIC MATHEMATICAL INITIAL DAD
|
||
+<U0001EE3B> <U063A> % ARABIC MATHEMATICAL INITIAL GHAIN
|
||
+<U0001EE42> <U062C> % ARABIC MATHEMATICAL TAILED JEEM
|
||
+<U0001EE47> <U062D> % ARABIC MATHEMATICAL TAILED HAH
|
||
+<U0001EE49> <U064A> % ARABIC MATHEMATICAL TAILED YEH
|
||
+<U0001EE4B> <U0644> % ARABIC MATHEMATICAL TAILED LAM
|
||
+<U0001EE4D> <U0646> % ARABIC MATHEMATICAL TAILED NOON
|
||
+<U0001EE4E> <U0633> % ARABIC MATHEMATICAL TAILED SEEN
|
||
+<U0001EE4F> <U0639> % ARABIC MATHEMATICAL TAILED AIN
|
||
+<U0001EE51> <U0635> % ARABIC MATHEMATICAL TAILED SAD
|
||
+<U0001EE52> <U0642> % ARABIC MATHEMATICAL TAILED QAF
|
||
+<U0001EE54> <U0634> % ARABIC MATHEMATICAL TAILED SHEEN
|
||
+<U0001EE57> <U062E> % ARABIC MATHEMATICAL TAILED KHAH
|
||
+<U0001EE59> <U0636> % ARABIC MATHEMATICAL TAILED DAD
|
||
+<U0001EE5B> <U063A> % ARABIC MATHEMATICAL TAILED GHAIN
|
||
+<U0001EE5D> <U06BA> % ARABIC MATHEMATICAL TAILED DOTLESS NOON
|
||
+<U0001EE5F> <U066F> % ARABIC MATHEMATICAL TAILED DOTLESS QAF
|
||
+<U0001EE61> <U0628> % ARABIC MATHEMATICAL STRETCHED BEH
|
||
+<U0001EE62> <U062C> % ARABIC MATHEMATICAL STRETCHED JEEM
|
||
+<U0001EE64> <U0647> % ARABIC MATHEMATICAL STRETCHED HEH
|
||
+<U0001EE67> <U062D> % ARABIC MATHEMATICAL STRETCHED HAH
|
||
+<U0001EE68> <U0637> % ARABIC MATHEMATICAL STRETCHED TAH
|
||
+<U0001EE69> <U064A> % ARABIC MATHEMATICAL STRETCHED YEH
|
||
+<U0001EE6A> <U0643> % ARABIC MATHEMATICAL STRETCHED KAF
|
||
+<U0001EE6C> <U0645> % ARABIC MATHEMATICAL STRETCHED MEEM
|
||
+<U0001EE6D> <U0646> % ARABIC MATHEMATICAL STRETCHED NOON
|
||
+<U0001EE6E> <U0633> % ARABIC MATHEMATICAL STRETCHED SEEN
|
||
+<U0001EE6F> <U0639> % ARABIC MATHEMATICAL STRETCHED AIN
|
||
+<U0001EE70> <U0641> % ARABIC MATHEMATICAL STRETCHED FEH
|
||
+<U0001EE71> <U0635> % ARABIC MATHEMATICAL STRETCHED SAD
|
||
+<U0001EE72> <U0642> % ARABIC MATHEMATICAL STRETCHED QAF
|
||
+<U0001EE74> <U0634> % ARABIC MATHEMATICAL STRETCHED SHEEN
|
||
+<U0001EE75> <U062A> % ARABIC MATHEMATICAL STRETCHED TEH
|
||
+<U0001EE76> <U062B> % ARABIC MATHEMATICAL STRETCHED THEH
|
||
+<U0001EE77> <U062E> % ARABIC MATHEMATICAL STRETCHED KHAH
|
||
+<U0001EE79> <U0636> % ARABIC MATHEMATICAL STRETCHED DAD
|
||
+<U0001EE7A> <U0638> % ARABIC MATHEMATICAL STRETCHED ZAH
|
||
+<U0001EE7B> <U063A> % ARABIC MATHEMATICAL STRETCHED GHAIN
|
||
+<U0001EE7C> <U066E> % ARABIC MATHEMATICAL STRETCHED DOTLESS BEH
|
||
+<U0001EE7E> <U06A1> % ARABIC MATHEMATICAL STRETCHED DOTLESS FEH
|
||
+<U0001EE80> <U0627> % ARABIC MATHEMATICAL LOOPED ALEF
|
||
+<U0001EE81> <U0628> % ARABIC MATHEMATICAL LOOPED BEH
|
||
+<U0001EE82> <U062C> % ARABIC MATHEMATICAL LOOPED JEEM
|
||
+<U0001EE83> <U062F> % ARABIC MATHEMATICAL LOOPED DAL
|
||
+<U0001EE84> <U0647> % ARABIC MATHEMATICAL LOOPED HEH
|
||
+<U0001EE85> <U0648> % ARABIC MATHEMATICAL LOOPED WAW
|
||
+<U0001EE86> <U0632> % ARABIC MATHEMATICAL LOOPED ZAIN
|
||
+<U0001EE87> <U062D> % ARABIC MATHEMATICAL LOOPED HAH
|
||
+<U0001EE88> <U0637> % ARABIC MATHEMATICAL LOOPED TAH
|
||
+<U0001EE89> <U064A> % ARABIC MATHEMATICAL LOOPED YEH
|
||
+<U0001EE8B> <U0644> % ARABIC MATHEMATICAL LOOPED LAM
|
||
+<U0001EE8C> <U0645> % ARABIC MATHEMATICAL LOOPED MEEM
|
||
+<U0001EE8D> <U0646> % ARABIC MATHEMATICAL LOOPED NOON
|
||
+<U0001EE8E> <U0633> % ARABIC MATHEMATICAL LOOPED SEEN
|
||
+<U0001EE8F> <U0639> % ARABIC MATHEMATICAL LOOPED AIN
|
||
+<U0001EE90> <U0641> % ARABIC MATHEMATICAL LOOPED FEH
|
||
+<U0001EE91> <U0635> % ARABIC MATHEMATICAL LOOPED SAD
|
||
+<U0001EE92> <U0642> % ARABIC MATHEMATICAL LOOPED QAF
|
||
+<U0001EE93> <U0631> % ARABIC MATHEMATICAL LOOPED REH
|
||
+<U0001EE94> <U0634> % ARABIC MATHEMATICAL LOOPED SHEEN
|
||
+<U0001EE95> <U062A> % ARABIC MATHEMATICAL LOOPED TEH
|
||
+<U0001EE96> <U062B> % ARABIC MATHEMATICAL LOOPED THEH
|
||
+<U0001EE97> <U062E> % ARABIC MATHEMATICAL LOOPED KHAH
|
||
+<U0001EE98> <U0630> % ARABIC MATHEMATICAL LOOPED THAL
|
||
+<U0001EE99> <U0636> % ARABIC MATHEMATICAL LOOPED DAD
|
||
+<U0001EE9A> <U0638> % ARABIC MATHEMATICAL LOOPED ZAH
|
||
+<U0001EE9B> <U063A> % ARABIC MATHEMATICAL LOOPED GHAIN
|
||
+<U0001EEA1> <U0628> % ARABIC MATHEMATICAL DOUBLE-STRUCK BEH
|
||
+<U0001EEA2> <U062C> % ARABIC MATHEMATICAL DOUBLE-STRUCK JEEM
|
||
+<U0001EEA3> <U062F> % ARABIC MATHEMATICAL DOUBLE-STRUCK DAL
|
||
+<U0001EEA5> <U0648> % ARABIC MATHEMATICAL DOUBLE-STRUCK WAW
|
||
+<U0001EEA6> <U0632> % ARABIC MATHEMATICAL DOUBLE-STRUCK ZAIN
|
||
+<U0001EEA7> <U062D> % ARABIC MATHEMATICAL DOUBLE-STRUCK HAH
|
||
+<U0001EEA8> <U0637> % ARABIC MATHEMATICAL DOUBLE-STRUCK TAH
|
||
+<U0001EEA9> <U064A> % ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
|
||
+<U0001EEAB> <U0644> % ARABIC MATHEMATICAL DOUBLE-STRUCK LAM
|
||
+<U0001EEAC> <U0645> % ARABIC MATHEMATICAL DOUBLE-STRUCK MEEM
|
||
+<U0001EEAD> <U0646> % ARABIC MATHEMATICAL DOUBLE-STRUCK NOON
|
||
+<U0001EEAE> <U0633> % ARABIC MATHEMATICAL DOUBLE-STRUCK SEEN
|
||
+<U0001EEAF> <U0639> % ARABIC MATHEMATICAL DOUBLE-STRUCK AIN
|
||
+<U0001EEB0> <U0641> % ARABIC MATHEMATICAL DOUBLE-STRUCK FEH
|
||
+<U0001EEB1> <U0635> % ARABIC MATHEMATICAL DOUBLE-STRUCK SAD
|
||
+<U0001EEB2> <U0642> % ARABIC MATHEMATICAL DOUBLE-STRUCK QAF
|
||
+<U0001EEB3> <U0631> % ARABIC MATHEMATICAL DOUBLE-STRUCK REH
|
||
+<U0001EEB4> <U0634> % ARABIC MATHEMATICAL DOUBLE-STRUCK SHEEN
|
||
+<U0001EEB5> <U062A> % ARABIC MATHEMATICAL DOUBLE-STRUCK TEH
|
||
+<U0001EEB6> <U062B> % ARABIC MATHEMATICAL DOUBLE-STRUCK THEH
|
||
+<U0001EEB7> <U062E> % ARABIC MATHEMATICAL DOUBLE-STRUCK KHAH
|
||
+<U0001EEB8> <U0630> % ARABIC MATHEMATICAL DOUBLE-STRUCK THAL
|
||
+<U0001EEB9> <U0636> % ARABIC MATHEMATICAL DOUBLE-STRUCK DAD
|
||
+<U0001EEBA> <U0638> % ARABIC MATHEMATICAL DOUBLE-STRUCK ZAH
|
||
+<U0001EEBB> <U063A> % ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
|
||
|
||
translit_end
|
||
|
||
diff --git a/localedata/locales/translit_fraction b/localedata/locales/translit_fraction
|
||
index 50dbd78..30f2843 100644
|
||
--- a/localedata/locales/translit_fraction
|
||
+++ b/localedata/locales/translit_fraction
|
||
@@ -2,10 +2,7 @@ escape_char /
|
||
comment_char %
|
||
|
||
% Transliterations of fractions.
|
||
-% Generated through
|
||
-% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<fraction>[^;]*;' UnicodeData.txt | \
|
||
-% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<fraction> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G'
|
||
-%
|
||
+% Generated automatically from UnicodeData.txt by gen_translit_fraction.py on 2015-06-10 for Unicode 7.0.0.
|
||
% The replacements have been surrounded with spaces, because fractions are
|
||
% often preceded by a decimal number and followed by a unit or a math symbol.
|
||
|
||
@@ -19,6 +16,12 @@ translit_start
|
||
<U00BD> "<U0020><U0031><U2044><U0032><U0020>";"<U0020><U0031><U002F><U0032><U0020>"
|
||
% VULGAR FRACTION THREE QUARTERS
|
||
<U00BE> "<U0020><U0033><U2044><U0034><U0020>";"<U0020><U0033><U002F><U0034><U0020>"
|
||
+% VULGAR FRACTION ONE SEVENTH
|
||
+<U2150> "<U0020><U0031><U2044><U0037><U0020>";"<U0020><U0031><U002F><U0037><U0020>"
|
||
+% VULGAR FRACTION ONE NINTH
|
||
+<U2151> "<U0020><U0031><U2044><U0039><U0020>";"<U0020><U0031><U002F><U0039><U0020>"
|
||
+% VULGAR FRACTION ONE TENTH
|
||
+<U2152> "<U0020><U0031><U2044><U0031><U0030><U0020>";"<U0020><U0031><U002F><U0031><U0030><U0020>"
|
||
% VULGAR FRACTION ONE THIRD
|
||
<U2153> "<U0020><U0031><U2044><U0033><U0020>";"<U0020><U0031><U002F><U0033><U0020>"
|
||
% VULGAR FRACTION TWO THIRDS
|
||
@@ -44,7 +47,9 @@ translit_start
|
||
% VULGAR FRACTION SEVEN EIGHTHS
|
||
<U215E> "<U0020><U0037><U2044><U0038><U0020>";"<U0020><U0037><U002F><U0038><U0020>"
|
||
% FRACTION NUMERATOR ONE
|
||
-<U215F> "<U0020><U0031><U2044>";"<U0020><U0031><U002F>"
|
||
+<U215F> "<U0020><U0031><U2044><U0020>";"<U0020><U0031><U002F><U0020>"
|
||
+% VULGAR FRACTION ZERO THIRDS
|
||
+<U2189> "<U0020><U0030><U2044><U0033><U0020>";"<U0020><U0030><U002F><U0033><U0020>"
|
||
|
||
translit_end
|
||
|
||
diff --git a/localedata/unicode-gen/Makefile b/localedata/unicode-gen/Makefile
|
||
index 166ee31..920bf0e 100644
|
||
--- a/localedata/unicode-gen/Makefile
|
||
+++ b/localedata/unicode-gen/Makefile
|
||
@@ -41,7 +41,7 @@ PYTHON3 = python3
|
||
WGET = wget
|
||
|
||
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
|
||
-GENERATED = i18n UTF-8
|
||
+GENERATED = i18n UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction
|
||
REPORTS = i18n-report UTF-8-report
|
||
|
||
all: $(GENERATED)
|
||
@@ -51,6 +51,12 @@ check: check-i18n check-UTF-8
|
||
install:
|
||
cp -p i18n ../locales/i18n
|
||
cp -p UTF-8 ../charmaps/UTF-8
|
||
+ cp -p translit_combining ../locales/translit_combining
|
||
+ cp -p translit_compat ../locales/translit_compat
|
||
+ cp -p translit_circle ../locales/translit_circle
|
||
+ cp -p translit_cjk_compat ../locales/translit_cjk_compat
|
||
+ cp -p translit_font ../locales/translit_font
|
||
+ cp -p translit_fraction ../locales/translit_fraction
|
||
|
||
clean: mostlyclean
|
||
-rm -rf __pycache__
|
||
@@ -82,13 +88,43 @@ UTF-8: utf8_gen.py
|
||
|
||
UTF-8-report: UTF-8 ../charmaps/UTF-8
|
||
UTF-8-report: utf8_compatibility.py
|
||
- $(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \
|
||
- -n UTF-8 -a -m > $@
|
||
+ $(PYTHON3) ./utf8_compatibility.py -u UnicodeData.txt \
|
||
+ -e EastAsianWidth.txt -o ../charmaps/UTF-8 \
|
||
+ -n UTF-8 -a -m -c > $@
|
||
|
||
check-UTF-8: UTF-8-report
|
||
@if grep '^Total.*: [^0]' UTF-8-report; \
|
||
then echo manual verification required; false; else true; fi
|
||
|
||
+translit_combining: UnicodeData.txt
|
||
+translit_combining: gen_translit_combining.py
|
||
+ $(PYTHON3) ./gen_translit_combining.py -u UnicodeData.txt \
|
||
+ -o $@ --unicode_version $(UNICODE_VERSION)
|
||
+
|
||
+translit_compat: UnicodeData.txt
|
||
+translit_compat: gen_translit_compat.py
|
||
+ $(PYTHON3) ./gen_translit_compat.py -u UnicodeData.txt \
|
||
+ -o $@ --unicode_version $(UNICODE_VERSION)
|
||
+
|
||
+translit_circle: UnicodeData.txt
|
||
+translit_circle: gen_translit_circle.py
|
||
+ $(PYTHON3) ./gen_translit_circle.py -u UnicodeData.txt \
|
||
+ -o $@ --unicode_version $(UNICODE_VERSION)
|
||
+
|
||
+translit_cjk_compat: UnicodeData.txt
|
||
+translit_cjk_compat: gen_translit_cjk_compat.py
|
||
+ $(PYTHON3) ./gen_translit_cjk_compat.py -u UnicodeData.txt \
|
||
+ -o $@ --unicode_version $(UNICODE_VERSION)
|
||
+
|
||
+translit_font: UnicodeData.txt
|
||
+translit_font: gen_translit_font.py
|
||
+ $(PYTHON3) ./gen_translit_font.py -u UnicodeData.txt \
|
||
+ -o $@ --unicode_version $(UNICODE_VERSION)
|
||
+
|
||
+translit_fraction: UnicodeData.txt
|
||
+translit_fraction: gen_translit_fraction.py
|
||
+ $(PYTHON3) ./gen_translit_fraction.py -u UnicodeData.txt \
|
||
+ -o $@ --unicode_version $(UNICODE_VERSION)
|
||
|
||
.PHONY: downloads clean-downloads
|
||
downloads: $(DOWNLOADS)
|
||
diff --git a/localedata/unicode-gen/gen_translit_circle.py b/localedata/unicode-gen/gen_translit_circle.py
|
||
new file mode 100755
|
||
index 0000000..6142859
|
||
--- /dev/null
|
||
+++ b/localedata/unicode-gen/gen_translit_circle.py
|
||
@@ -0,0 +1,150 @@
|
||
+#!/usr/bin/python3
|
||
+# -*- coding: utf-8 -*-
|
||
+#
|
||
+# Generate a translit_circle file from a UnicodeData file.
|
||
+# Copyright (C) 2015 Free Software Foundation, Inc.
|
||
+# This file is part of the GNU C Library.
|
||
+#
|
||
+# The GNU C Library is free software; you can redistribute it and/or
|
||
+# modify it under the terms of the GNU Lesser General Public
|
||
+# License as published by the Free Software Foundation; either
|
||
+# version 2.1 of the License, or (at your option) any later version.
|
||
+#
|
||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
+# Lesser General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU Lesser General Public
|
||
+# License along with the GNU C Library; if not, see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+'''
|
||
+Generate a translit_circle file from UnicodeData.txt
|
||
+
|
||
+To see how this script is used, call it with the “-h” option:
|
||
+
|
||
+ $ ./gen_translit_circle.py -h
|
||
+ … prints usage message …
|
||
+'''
|
||
+
|
||
+import argparse
|
||
+import time
|
||
+import unicode_utils
|
||
+
|
||
+def read_input_file(filename):
|
||
+ '''Reads the original glibc translit_circle file to get the
|
||
+ original head and tail.
|
||
+
|
||
+ We want to replace only the part of the file between
|
||
+ “translit_start” and “translit_end”
|
||
+ '''
|
||
+ head = tail = ''
|
||
+ with open(filename, mode='r') as translit_file:
|
||
+ for line in translit_file:
|
||
+ head = head + line
|
||
+ if line.startswith('translit_start'):
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ if line.startswith('translit_end'):
|
||
+ tail = line
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ tail = tail + line
|
||
+ return (head, tail)
|
||
+
|
||
+def output_head(translit_file, unicode_version, head=''):
|
||
+ '''Write the header of the output file, i.e. the part of the file
|
||
+ before the “translit_start” line.
|
||
+ '''
|
||
+ if ARGS.input_file and head:
|
||
+ translit_file.write(head)
|
||
+ else:
|
||
+ translit_file.write('escape_char /\n')
|
||
+ translit_file.write('comment_char %\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('% Transliterations of encircled characters.\n')
|
||
+ translit_file.write('% Generated automatically from UnicodeData.txt '
|
||
+ + 'by gen_translit_circle.py '
|
||
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
|
||
+ + 'for Unicode {:s}.\n'.format(unicode_version))
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('LC_CTYPE\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('translit_start\n')
|
||
+
|
||
+def output_tail(translit_file, tail=''):
|
||
+ '''Write the tail of the output file'''
|
||
+ if ARGS.input_file and tail:
|
||
+ translit_file.write(tail)
|
||
+ else:
|
||
+ translit_file.write('translit_end\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('END LC_CTYPE\n')
|
||
+
|
||
+def output_transliteration(translit_file):
|
||
+ '''Write the new transliteration to the output file'''
|
||
+ translit_file.write('\n')
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ decomposition = unicode_utils.UNICODE_ATTRIBUTES[
|
||
+ code_point]['decomposition']
|
||
+ if decomposition.startswith('<circle>'):
|
||
+ decomposition = decomposition[9:]
|
||
+ decomposed_code_points = [int(x, 16)
|
||
+ for x in decomposition.split(' ')]
|
||
+ translit_file.write('% {:s}\n'.format(name))
|
||
+ translit_file.write('{:s} "<U0028>'.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ for decomposed_code_point in decomposed_code_points:
|
||
+ translit_file.write('{:s}'.format(
|
||
+ unicode_utils.ucs_symbol(decomposed_code_point)))
|
||
+ translit_file.write('<U0029>"\n')
|
||
+ translit_file.write('\n')
|
||
+
|
||
+
|
||
+if __name__ == "__main__":
|
||
+ PARSER = argparse.ArgumentParser(
|
||
+ description='''
|
||
+ Generate a translit_circle file from UnicodeData.txt.
|
||
+ ''')
|
||
+ PARSER.add_argument(
|
||
+ '-u', '--unicode_data_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='UnicodeData.txt',
|
||
+ help=('The UnicodeData.txt file to read, '
|
||
+ + 'default: %(default)s'))
|
||
+ PARSER.add_argument(
|
||
+ '-i', '--input_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ help=''' The original glibc/localedata/locales/translit_circle
|
||
+ file.''')
|
||
+ PARSER.add_argument(
|
||
+ '-o', '--output_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='translit_circle.new',
|
||
+ help='''The new translit_circle file, default: %(default)s. If the
|
||
+ original glibc/localedata/locales/translit_circle file has
|
||
+ been given as an option, the header up to the
|
||
+ “translit_start” line and the tail from the “translit_end”
|
||
+ line to the end of the file will be copied unchanged into the
|
||
+ output file. ''')
|
||
+ PARSER.add_argument(
|
||
+ '--unicode_version',
|
||
+ nargs='?',
|
||
+ required=True,
|
||
+ type=str,
|
||
+ help='The Unicode version of the input files used.')
|
||
+ ARGS = PARSER.parse_args()
|
||
+
|
||
+ unicode_utils.fill_attributes(ARGS.unicode_data_file)
|
||
+ HEAD = TAIL = ''
|
||
+ if ARGS.input_file:
|
||
+ (HEAD, TAIL) = read_input_file(ARGS.input_file)
|
||
+ with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
|
||
+ output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
|
||
+ output_transliteration(TRANSLIT_FILE)
|
||
+ output_tail(TRANSLIT_FILE, tail=TAIL)
|
||
diff --git a/localedata/unicode-gen/gen_translit_cjk_compat.py b/localedata/unicode-gen/gen_translit_cjk_compat.py
|
||
new file mode 100755
|
||
index 0000000..627ff6b
|
||
--- /dev/null
|
||
+++ b/localedata/unicode-gen/gen_translit_cjk_compat.py
|
||
@@ -0,0 +1,220 @@
|
||
+#!/usr/bin/python3
|
||
+# -*- coding: utf-8 -*-
|
||
+#
|
||
+# Generate a translit_cjk_compat file from a UnicodeData file.
|
||
+# Copyright (C) 2015 Free Software Foundation, Inc.
|
||
+# This file is part of the GNU C Library.
|
||
+#
|
||
+# The GNU C Library is free software; you can redistribute it and/or
|
||
+# modify it under the terms of the GNU Lesser General Public
|
||
+# License as published by the Free Software Foundation; either
|
||
+# version 2.1 of the License, or (at your option) any later version.
|
||
+#
|
||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
+# Lesser General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU Lesser General Public
|
||
+# License along with the GNU C Library; if not, see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+'''
|
||
+Generate a translit_cjk_compat file from UnicodeData.txt
|
||
+
|
||
+To see how this script is used, call it with the “-h” option:
|
||
+
|
||
+ $ ./gen_translit_cjk_compat.py -h
|
||
+ … prints usage message …
|
||
+'''
|
||
+
|
||
+import argparse
|
||
+import time
|
||
+import sys
|
||
+import unicode_utils
|
||
+
|
||
+def read_input_file(filename):
|
||
+ '''Reads the original glibc translit_cjk_compat file to get the
|
||
+ original head and tail.
|
||
+
|
||
+ We want to replace only the part of the file between
|
||
+ “translit_start” and “translit_end”
|
||
+ '''
|
||
+ head = tail = ''
|
||
+ with open(filename, mode='r') as translit_file:
|
||
+ for line in translit_file:
|
||
+ head = head + line
|
||
+ if line.startswith('translit_start'):
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ if line.startswith('translit_end'):
|
||
+ tail = line
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ tail = tail + line
|
||
+ return (head, tail)
|
||
+
|
||
+def output_head(translit_file, unicode_version, head=''):
|
||
+ '''Write the header of the output file, i.e. the part of the file
|
||
+ before the “translit_start” line.
|
||
+ '''
|
||
+ if ARGS.input_file and head:
|
||
+ translit_file.write(head)
|
||
+ else:
|
||
+ translit_file.write('escape_char /\n')
|
||
+ translit_file.write('comment_char %\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('% Transliterations of CJK compatibility ')
|
||
+ translit_file.write('characters.\n')
|
||
+ translit_file.write('% Generated automatically from UnicodeData.txt '
|
||
+ + 'by gen_translit_cjk_compat.py '
|
||
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
|
||
+ + 'for Unicode {:s}.\n'.format(unicode_version))
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('LC_CTYPE\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('translit_start\n')
|
||
+
|
||
+def output_tail(translit_file, tail=''):
|
||
+ '''Write the tail of the output file'''
|
||
+ if ARGS.input_file and tail:
|
||
+ translit_file.write(tail)
|
||
+ else:
|
||
+ translit_file.write('translit_end\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('END LC_CTYPE\n')
|
||
+
|
||
+def special_decompose(code_point_list):
|
||
+ '''
|
||
+ Decompositions which are not in UnicodeData.txt at all but which
|
||
+ were used in the original translit_cjk_compat file in glibc and
|
||
+ which seem to make sense. I want to keep the update of
|
||
+ translit_cjk_compat close to the spirit of the original file,
|
||
+ therefore I added these special decomposition rules here.
|
||
+ '''
|
||
+ special_decompose_dict = {
|
||
+ (0x2215,): [0x002F], # ∕ → /
|
||
+ (0x00B2,): [0x005E, 0x0032], # ² → ^2
|
||
+ (0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN)
|
||
+ (0x2113,): [0x006C], # ℓ → l
|
||
+ (0x00B3,): [0x005E, 0x0033], # ³ → ^3
|
||
+ (0x00B5,): [0x0075], # µ → u
|
||
+ (0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl
|
||
+ (0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [
|
||
+ 0x0072, 0x0061, 0x0064, 0x002F, 0x0073, 0x00B2],
|
||
+ (0x006D, 0x2215, 0x0073, 0x00B2): [0x006D, 0x002F, 0x0073, 0x00B2],
|
||
+ }
|
||
+ if tuple(code_point_list) in special_decompose_dict:
|
||
+ return special_decompose_dict[tuple(code_point_list)]
|
||
+ else:
|
||
+ return code_point_list
|
||
+
|
||
+def output_transliteration(translit_file):
|
||
+ '''Write the new transliteration to the output file'''
|
||
+ translit_file.write('\n')
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ decomposition = unicode_utils.UNICODE_ATTRIBUTES[
|
||
+ code_point]['decomposition']
|
||
+ if decomposition.startswith('<square>'):
|
||
+ decomposition = decomposition[9:]
|
||
+ decomposed_code_points = [[int(x, 16)
|
||
+ for x in decomposition.split(' ')]]
|
||
+ if decomposed_code_points[0]:
|
||
+ while True:
|
||
+ special_decomposed_code_points = special_decompose(
|
||
+ decomposed_code_points[-1])
|
||
+ if (special_decomposed_code_points
|
||
+ != decomposed_code_points[-1]):
|
||
+ decomposed_code_points.append(
|
||
+ special_decomposed_code_points)
|
||
+ continue
|
||
+ special_decomposed_code_points = []
|
||
+ for decomposed_code_point in decomposed_code_points[-1]:
|
||
+ special_decomposed_code_points += special_decompose(
|
||
+ [decomposed_code_point])
|
||
+ if (special_decomposed_code_points
|
||
+ == decomposed_code_points[-1]):
|
||
+ break
|
||
+ decomposed_code_points.append(
|
||
+ special_decomposed_code_points)
|
||
+ translit_file.write('% {:s}\n'.format(name))
|
||
+ translit_file.write('{:s} '.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ for index in range(0, len(decomposed_code_points)):
|
||
+ if index > 0:
|
||
+ translit_file.write(';')
|
||
+ if len(decomposed_code_points[index]) > 1:
|
||
+ translit_file.write('"')
|
||
+ for decomposed_code_point in decomposed_code_points[index]:
|
||
+ translit_file.write('{:s}'.format(
|
||
+ unicode_utils.ucs_symbol(decomposed_code_point)))
|
||
+ if len(decomposed_code_points[index]) > 1:
|
||
+ translit_file.write('"')
|
||
+ translit_file.write('\n')
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ decomposition = unicode_utils.UNICODE_ATTRIBUTES[
|
||
+ code_point]['decomposition']
|
||
+ if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
|
||
+ decomposed_code_points = [int(x, 16)
|
||
+ for x in decomposition.split(' ')]
|
||
+ if len(decomposed_code_points) != 1:
|
||
+ sys.stderr.write(
|
||
+ 'Unexpected decomposition length {:x} {:s} {:s}\n'.format(
|
||
+ code_point, name, decomposition))
|
||
+ exit(1)
|
||
+ translit_file.write('% {:s}\n'.format(name))
|
||
+ translit_file.write('{:s} '.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ for decomposed_code_point in decomposed_code_points:
|
||
+ translit_file.write('{:s}'.format(
|
||
+ unicode_utils.ucs_symbol(decomposed_code_point)))
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('\n')
|
||
+
|
||
+if __name__ == "__main__":
|
||
+ PARSER = argparse.ArgumentParser(
|
||
+ description='''
|
||
+ Generate a translit_cjk_compat file from UnicodeData.txt.
|
||
+ ''')
|
||
+ PARSER.add_argument(
|
||
+ '-u', '--unicode_data_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='UnicodeData.txt',
|
||
+ help=('The UnicodeData.txt file to read, '
|
||
+ + 'default: %(default)s'))
|
||
+ PARSER.add_argument(
|
||
+ '-i', '--input_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ help=''' The original glibc/localedata/locales/translit_cjk_compat
|
||
+ file.''')
|
||
+ PARSER.add_argument(
|
||
+ '-o', '--output_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='translit_cjk_compat.new',
|
||
+ help='''The new translit_cjk_compat file, default: %(default)s. If the
|
||
+ original glibc/localedata/locales/translit_cjk_compat file has
|
||
+ been given as an option, the header up to the
|
||
+ “translit_start” line and the tail from the “translit_end”
|
||
+ line to the end of the file will be copied unchanged into the
|
||
+ output file. ''')
|
||
+ PARSER.add_argument(
|
||
+ '--unicode_version',
|
||
+ nargs='?',
|
||
+ required=True,
|
||
+ type=str,
|
||
+ help='The Unicode version of the input files used.')
|
||
+ ARGS = PARSER.parse_args()
|
||
+
|
||
+ unicode_utils.fill_attributes(ARGS.unicode_data_file)
|
||
+ HEAD = TAIL = ''
|
||
+ if ARGS.input_file:
|
||
+ (HEAD, TAIL) = read_input_file(ARGS.input_file)
|
||
+ with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
|
||
+ output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
|
||
+ output_transliteration(TRANSLIT_FILE)
|
||
+ output_tail(TRANSLIT_FILE, tail=TAIL)
|
||
diff --git a/localedata/unicode-gen/gen_translit_combining.py b/localedata/unicode-gen/gen_translit_combining.py
|
||
new file mode 100755
|
||
index 0000000..2551ce1
|
||
--- /dev/null
|
||
+++ b/localedata/unicode-gen/gen_translit_combining.py
|
||
@@ -0,0 +1,442 @@
|
||
+#!/usr/bin/python3
|
||
+# -*- coding: utf-8 -*-
|
||
+#
|
||
+# Generate a translit_combining file from a UnicodeData file.
|
||
+# Copyright (C) 2015 Free Software Foundation, Inc.
|
||
+# This file is part of the GNU C Library.
|
||
+#
|
||
+# The GNU C Library is free software; you can redistribute it and/or
|
||
+# modify it under the terms of the GNU Lesser General Public
|
||
+# License as published by the Free Software Foundation; either
|
||
+# version 2.1 of the License, or (at your option) any later version.
|
||
+#
|
||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
+# Lesser General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU Lesser General Public
|
||
+# License along with the GNU C Library; if not, see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+'''
|
||
+Generate a translit_combining file from UnicodeData.txt
|
||
+
|
||
+To see how this script is used, call it with the “-h” option:
|
||
+
|
||
+ $ ./gen_translit_combining.py -h
|
||
+ … prints usage message …
|
||
+'''
|
||
+
|
||
+import argparse
|
||
+import time
|
||
+import unicode_utils
|
||
+
|
||
+def read_input_file(filename):
|
||
+ '''Reads the original glibc translit_combining file to get the
|
||
+ original head and tail.
|
||
+
|
||
+ We want to replace only the part of the file between
|
||
+ “translit_start” and “translit_end”
|
||
+ '''
|
||
+ head = tail = ''
|
||
+ with open(filename, mode='r') as translit_file:
|
||
+ for line in translit_file:
|
||
+ head = head + line
|
||
+ if line.startswith('translit_start'):
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ if line.startswith('translit_end'):
|
||
+ tail = line
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ tail = tail + line
|
||
+ return (head, tail)
|
||
+
|
||
+def output_head(translit_file, unicode_version, head=''):
|
||
+ '''Write the header of the output file, i.e. the part of the file
|
||
+ before the “translit_start” line.
|
||
+ '''
|
||
+ if ARGS.input_file and head:
|
||
+ translit_file.write(head)
|
||
+ else:
|
||
+ translit_file.write('escape_char /\n')
|
||
+ translit_file.write('comment_char %\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('% Transliterations that remove all ')
|
||
+ translit_file.write('combining characters (accents,\n')
|
||
+ translit_file.write('% pronounciation marks, etc.).\n')
|
||
+ translit_file.write('% Generated automatically from UnicodeData.txt '
|
||
+ + 'by gen_translit_combining.py '
|
||
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
|
||
+ + 'for Unicode {:s}.\n'.format(unicode_version))
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('LC_CTYPE\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('translit_start\n')
|
||
+
|
||
+def output_tail(translit_file, tail=''):
|
||
+ '''Write the tail of the output file'''
|
||
+ if ARGS.input_file and tail:
|
||
+ translit_file.write(tail)
|
||
+ else:
|
||
+ translit_file.write('translit_end\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('END LC_CTYPE\n')
|
||
+
|
||
+def is_combining_remove(code_point):
|
||
+ '''Check whether this is a combining character which should be listed
|
||
+ in the section of the translit_combining file where combining
|
||
+ characters are replaced by empty strings.
|
||
+
|
||
+ We ignore combining characters from many scripts here because
|
||
+ the original translit_combining file didn’t do this for the
|
||
+ combining characters from these scripts either and I am not
|
||
+ sure yet whether this would be useful to do for all combining
|
||
+ characters or not. For the moment I think it is better to keep
|
||
+ close to the spirit of the original file.
|
||
+ '''
|
||
+ if not unicode_utils.is_combining(code_point):
|
||
+ return False
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ for substring in ('DEVANAGARI',
|
||
+ 'BENGALI',
|
||
+ 'CYRILLIC',
|
||
+ 'SYRIAC',
|
||
+ 'THAANA',
|
||
+ 'NKO',
|
||
+ 'GURMUKHI',
|
||
+ 'TAMIL',
|
||
+ 'GUJARATI',
|
||
+ 'ORIYA',
|
||
+ 'TELUGU',
|
||
+ 'KANNADA',
|
||
+ 'MALAYALAM',
|
||
+ 'SINHALA',
|
||
+ 'THAI',
|
||
+ 'LAO',
|
||
+ 'TIBETAN',
|
||
+ 'MYANMAR',
|
||
+ 'ETHIOPIC',
|
||
+ 'TAGALOG',
|
||
+ 'HANUNOO',
|
||
+ 'BUHID',
|
||
+ 'TAGBANWA',
|
||
+ 'KHMER',
|
||
+ 'MONGOLIAN',
|
||
+ 'LIMBU',
|
||
+ 'NEW TAI LUE',
|
||
+ 'BUGINESE',
|
||
+ 'BALINESE',
|
||
+ 'SUNDANESE',
|
||
+ 'LEPCHA',
|
||
+ 'IDEOGRAPHIC',
|
||
+ 'HANGUL',
|
||
+ 'SYLOTI',
|
||
+ 'SAURASHTRA',
|
||
+ 'KAYAH',
|
||
+ 'REJANG',
|
||
+ 'CHAM',
|
||
+ 'VARIATION SELECTOR',
|
||
+ 'KHAROSHTHI',
|
||
+ 'MUSICAL SYMBOL',
|
||
+ 'SAMARITAN',
|
||
+ 'MANDAIC',
|
||
+ 'TAI THAM',
|
||
+ 'BATAK',
|
||
+ 'VEDIC',
|
||
+ 'COPTIC',
|
||
+ 'TIFINAGH',
|
||
+ 'BAMUM',
|
||
+ 'JAVANESE',
|
||
+ 'TAI VIET',
|
||
+ 'MEETEI',
|
||
+ 'MANICHAEAN',
|
||
+ 'BRAHMI',
|
||
+ 'KAITHI',
|
||
+ 'CHAKMA',
|
||
+ 'MAHAJANI',
|
||
+ 'SHARADA',
|
||
+ 'KHOJKI',
|
||
+ 'KHUDAWADI',
|
||
+ 'GRANTHA',
|
||
+ 'TIRHUTA',
|
||
+ 'SIDDHAM',
|
||
+ 'MODI VOWEL',
|
||
+ 'MODI SIGN',
|
||
+ 'TAKRI',
|
||
+ 'BASSA VAH',
|
||
+ 'PAHAWH HMONG',
|
||
+ 'MIAO',
|
||
+ 'DUPLOYAN',
|
||
+ 'MENDE KIKAKUI'
|
||
+ ):
|
||
+ if substring in name:
|
||
+ return False
|
||
+ return True
|
||
+
|
||
+def canonical_decompose(code_point):
|
||
+ '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
|
||
+
|
||
+ In some instances a canonical mapping or a compatibility mapping
|
||
+ may consist of a single character. For a canonical mapping, this
|
||
+ indicates that the character is a canonical equivalent of another
|
||
+ single character. For a compatibility mapping, this indicates that
|
||
+ the character is a compatibility equivalent of another single
|
||
+ character.
|
||
+
|
||
+ A canonical mapping may also consist of a pair of characters, but
|
||
+ is never longer than two characters. When a canonical mapping
|
||
+ consists of a pair of characters, the first character may itself
|
||
+ be a character with a decomposition mapping, but the second
|
||
+ character never has a decomposition mapping.
|
||
+
|
||
+ We ignore the canonical decomposition for code points
|
||
+ matching certain substrings because the original translit_combining
|
||
+ file didn’t include these types of characters either. I am unsure
|
||
+ about the usefulness of including them and want to keep close
|
||
+ to the spirit of the original file for the moment.
|
||
+ '''
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ for substring in ('MUSICAL SYMBOL',
|
||
+ 'CJK COMPATIBILITY IDEOGRAPH',
|
||
+ 'BALINESE',
|
||
+ 'KAITHI LETTER',
|
||
+ 'CHAKMA VOWEL',
|
||
+ 'GRANTHA VOWEL',
|
||
+ 'TIRHUTA VOWEL',
|
||
+ 'SIDDHAM VOWEL'):
|
||
+ if substring in name:
|
||
+ return []
|
||
+ decomposition = unicode_utils.UNICODE_ATTRIBUTES[
|
||
+ code_point]['decomposition']
|
||
+ if decomposition and not decomposition.startswith('<'):
|
||
+ decomposed_code_points = [int(x, 16) for x in decomposition.split(' ')]
|
||
+ if decomposed_code_points:
|
||
+ cd0 = canonical_decompose(decomposed_code_points[0])
|
||
+ if cd0:
|
||
+ decomposed_code_points = cd0 + decomposed_code_points[1:]
|
||
+ return decomposed_code_points
|
||
+ else:
|
||
+ return []
|
||
+
|
||
+def special_decompose(code_point_list):
|
||
+ '''
|
||
+ Decompositions which are not canonical or which are not in
|
||
+ UnicodeData.txt at all but some of these were used in the original
|
||
+ translit_combining file in glibc and they seemed to make sense.
|
||
+ I want to keep the update of translit_combining close to the
|
||
+ spirit of the original file, therefore I added these special
|
||
+ decomposition rules here.
|
||
+ '''
|
||
+ special_decompose_dict = {
|
||
+ # Ø U+00D8 is already handled in translit_neutral. But
|
||
+ # translit_combining is usually included after translit_neutral
|
||
+ # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
|
||
+ # has a canonical decomposition to Ø U+00D8 and we want to
|
||
+ # further decompose this to U+004F.
|
||
+ (0x00D8,): [0x004F], # Ø → O
|
||
+ # ø U+00F8 is already handled in translit_neutral. But
|
||
+ # translit_combining is usually included after translit_neutral
|
||
+ # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE
|
||
+ # has a canonical decomposition to ø U+00F8 and we want to
|
||
+ # further decompose this to U+006F.
|
||
+ (0x00F8,): [0x006F], # ø → o
|
||
+ # æ U+00E6 is already in translit_compat because ligatures
|
||
+ # are handled in translit_compat. But ǣ U+01E3 has a
|
||
+ # canonical decomposition to U+00E6, U+0304 and we want to
|
||
+ # further decompose this to “ae”.
|
||
+ (0x00E6,): [0x0061, 0x0065], # æ → ae
|
||
+ # Æ U+00C6 is already in translit_compat because ligatures
|
||
+ # are handled in translit_compat. But Ǣ U+01E2 has a
|
||
+ # canonical decomposition to U+00C6, U+0304 and we want to
|
||
+ # further decompose this to “AE”
|
||
+ (0x00C6,): [0x0041, 0x0045], # Æ → AE
|
||
+ # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in
|
||
+ # translit_compat because ligatures are handled in translit_compat.
|
||
+ # But U+FB1F has a canonical decomposition to U+05F2 and
|
||
+ # we want to further decompose this to U+05D9, U+05D9.
|
||
+ (0x05F2,): [0x05D9, 0x05D9], # ײ → יי
|
||
+ # 0x2002 has a <compat> decomposition to 0x0020 in UnicodeData.txt
|
||
+ # But U+2000 EN QUAD has a canonical decomposition U+2002
|
||
+ # and we want to further decompose this to U+0020.
|
||
+ (0x2002,): [0x0020], # EN SPACE → SPACE
|
||
+ # 0x2003 has a <compat> decomposition to 0x0020 in UnicodeData.txt
|
||
+ # But U+2001 EM QUAD has a canonical decomposition to U+2003
|
||
+ # and we want to further decompose this to U+0020.
|
||
+ (0x2003,): [0x0020], # EM SPACE → SPACE
|
||
+ # U+2260 ≠ has the canonical decomposition U+003D U+0338
|
||
+ # (= followed by ̸). After stripping the combining characters,
|
||
+ # the result is only = which reverses the meaning.
|
||
+ # Therefore, we add special rules here for such mathematical
|
||
+ # negations:
|
||
+ (0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<->
|
||
+ (0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<=
|
||
+ (0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=>
|
||
+ (0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=>
|
||
+ (0x2204,): [0x0021, 0x2203], # ∄ → !∃
|
||
+ (0x2209,): [0x0021, 0x2208], # ∉ → !∈
|
||
+ (0x220C,): [0x0021, 0x220B], # ∌ → !∋
|
||
+ (0x2224,): [0x0021, 0x2223], # ∤ → !∣
|
||
+ (0x2226,): [0x0021, 0x2225], # ∦ → !∥
|
||
+ (0x2241,): [0x0021, 0x007E], # ≁ → !~
|
||
+ (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~-
|
||
+ (0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~=
|
||
+ (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~
|
||
+ (0x2260,): [0x0021, 0x003D], # ≠ → !=
|
||
+ (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !==
|
||
+ (0x226D,): [0x0021, 0x224D], # ≭ → !≍
|
||
+ (0x226E,): [0x0021, 0x003C], # ≮ → !<
|
||
+ (0x226F,): [0x0021, 0x003E], # ≯ → !>
|
||
+ (0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<=
|
||
+ (0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>=
|
||
+ (0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~
|
||
+ (0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~
|
||
+ (0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<>
|
||
+ (0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !><
|
||
+ (0x2280,): [0x0021, 0x227A], # ⊀ → !≺
|
||
+ (0x2281,): [0x0021, 0x227B], # ⊁ → !≻
|
||
+ (0x2284,): [0x0021, 0x2282], # ⊄ → !⊂
|
||
+ (0x2285,): [0x0021, 0x2283], # ⊅ → !⊃
|
||
+ (0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂=
|
||
+ (0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃=
|
||
+ (0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢
|
||
+ (0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨
|
||
+ (0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩
|
||
+ (0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫
|
||
+ (0x22E0,): [0x0021, 0x227C], # ⋠ → !≼
|
||
+ (0x22E1,): [0x0021, 0x227D], # ⋡ → !≽
|
||
+ (0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑
|
||
+ (0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒
|
||
+ (0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲
|
||
+ (0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳
|
||
+ (0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴
|
||
+ (0x22ED,): [0x0021, 0x22B5], # ⋭ → !⊵
|
||
+ (0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝
|
||
+ # Special rule for 〈 U+3008 is added
|
||
+ # because 〈 U+2329 has the canonical decomposition U+3008
|
||
+ # and we want to further decompose this to < U+003C.
|
||
+ (0x3008,): [0x003C], # 〈 → <
|
||
+ # Special rule for 〉 U+3009 is added
|
||
+ # because 〉 U+232A has the canonical decomposition U+3009
|
||
+ # and we want to further decompose this to > U+003E.
|
||
+ (0x3009,): [0x003E], # 〉→ >
|
||
+ }
|
||
+ if tuple(code_point_list) in special_decompose_dict:
|
||
+ return special_decompose_dict[tuple(code_point_list)]
|
||
+ else:
|
||
+ return code_point_list
|
||
+
|
||
+def output_combining_remove(translit_file):
|
||
+ '''Write the section of the translit_combining file where combining
|
||
+ characters are replaced by empty strings.
|
||
+ '''
|
||
+ translit_file.write('\n')
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ if is_combining_remove(code_point):
|
||
+ translit_file.write('% {:s}\n'.format(name))
|
||
+ translit_file.write('{:s} ""\n'.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ translit_file.write('\n')
|
||
+
|
||
+def output_decompositions(translit_file):
|
||
+ '''Write the section of the translit_combining file where characters
|
||
+ are decomposed and combining characters stripped from
|
||
+ the decompositions.
|
||
+ '''
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
+ if special_decompose([code_point]) != [code_point]:
|
||
+ decomposed_code_points = [special_decompose([code_point])]
|
||
+ else:
|
||
+ decomposed_code_points = [canonical_decompose(code_point)]
|
||
+ if decomposed_code_points[0]:
|
||
+ while True:
|
||
+ special_decomposed_code_points = special_decompose(
|
||
+ decomposed_code_points[-1])
|
||
+ if (special_decomposed_code_points
|
||
+ != decomposed_code_points[-1]):
|
||
+ decomposed_code_points.append(
|
||
+ special_decomposed_code_points)
|
||
+ continue
|
||
+ special_decomposed_code_points = []
|
||
+ for decomposed_code_point in decomposed_code_points[-1]:
|
||
+ special_decomposed_code_points += special_decompose(
|
||
+ [decomposed_code_point])
|
||
+ if (special_decomposed_code_points
|
||
+ == decomposed_code_points[-1]):
|
||
+ break
|
||
+ decomposed_code_points.append(
|
||
+ special_decomposed_code_points)
|
||
+ for index in range(0, len(decomposed_code_points)):
|
||
+ decomposed_code_points[index] = [
|
||
+ x for x in decomposed_code_points[index]
|
||
+ if not is_combining_remove(x)]
|
||
+ if decomposed_code_points[0]:
|
||
+ translit_file.write('% {:s}\n'.format(
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
|
||
+ translit_file.write('{:s} '.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ for index in range(0, len(decomposed_code_points)):
|
||
+ if index > 0:
|
||
+ translit_file.write(';')
|
||
+ if len(decomposed_code_points[index]) > 1:
|
||
+ translit_file.write('"')
|
||
+ for decomposed_code_point in decomposed_code_points[index]:
|
||
+ translit_file.write('{:s}'.format(
|
||
+ unicode_utils.ucs_symbol(decomposed_code_point)))
|
||
+ if len(decomposed_code_points[index]) > 1:
|
||
+ translit_file.write('"')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('\n')
|
||
+
|
||
+def output_transliteration(translit_file):
|
||
+ '''Write the new transliteration to the output file'''
|
||
+ output_combining_remove(translit_file)
|
||
+ output_decompositions(translit_file)
|
||
+
|
||
+if __name__ == "__main__":
|
||
+ PARSER = argparse.ArgumentParser(
|
||
+ description='''
|
||
+ Generate a translit_combining file from UnicodeData.txt.
|
||
+ ''')
|
||
+ PARSER.add_argument(
|
||
+ '-u', '--unicode_data_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='UnicodeData.txt',
|
||
+ help=('The UnicodeData.txt file to read, '
|
||
+ + 'default: %(default)s'))
|
||
+ PARSER.add_argument(
|
||
+ '-i', '--input_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ help=''' The original glibc/localedata/locales/translit_combining
|
||
+ file.''')
|
||
+ PARSER.add_argument(
|
||
+ '-o', '--output_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='translit_combining.new',
|
||
+ help='''The new translit_combining file, default: %(default)s. If the
|
||
+ original glibc/localedata/locales/translit_combining file has
|
||
+ been given as an option, the header up to the
|
||
+ “translit_start” line and the tail from the “translit_end”
|
||
+ line to the end of the file will be copied unchanged into the
|
||
+ output file. ''')
|
||
+ PARSER.add_argument(
|
||
+ '--unicode_version',
|
||
+ nargs='?',
|
||
+ required=True,
|
||
+ type=str,
|
||
+ help='The Unicode version of the input files used.')
|
||
+ ARGS = PARSER.parse_args()
|
||
+
|
||
+ unicode_utils.fill_attributes(ARGS.unicode_data_file)
|
||
+ HEAD = TAIL = ''
|
||
+ if ARGS.input_file:
|
||
+ (HEAD, TAIL) = read_input_file(ARGS.input_file)
|
||
+ with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
|
||
+ output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
|
||
+ output_transliteration(TRANSLIT_FILE)
|
||
+ output_tail(TRANSLIT_FILE, tail=TAIL)
|
||
diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py
|
||
new file mode 100755
|
||
index 0000000..0e824a8
|
||
--- /dev/null
|
||
+++ b/localedata/unicode-gen/gen_translit_compat.py
|
||
@@ -0,0 +1,326 @@
|
||
+#!/usr/bin/python3
|
||
+# -*- coding: utf-8 -*-
|
||
+#
|
||
+# Generate a translit_compat file from a UnicodeData file.
|
||
+# Copyright (C) 2015 Free Software Foundation, Inc.
|
||
+# This file is part of the GNU C Library.
|
||
+#
|
||
+# The GNU C Library is free software; you can redistribute it and/or
|
||
+# modify it under the terms of the GNU Lesser General Public
|
||
+# License as published by the Free Software Foundation; either
|
||
+# version 2.1 of the License, or (at your option) any later version.
|
||
+#
|
||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
+# Lesser General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU Lesser General Public
|
||
+# License along with the GNU C Library; if not, see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+'''
|
||
+Generate a translit_compat file from UnicodeData.txt
|
||
+
|
||
+To see how this script is used, call it with the “-h” option:
|
||
+
|
||
+ $ ./gen_translit_compat.py -h
|
||
+ … prints usage message …
|
||
+'''
|
||
+
|
||
+import argparse
|
||
+import time
|
||
+import unicode_utils
|
||
+
|
||
+def read_input_file(filename):
|
||
+ '''Reads the original glibc translit_compat file to get the
|
||
+ original head and tail.
|
||
+
|
||
+ We want to replace only the part of the file between
|
||
+ “translit_start” and “translit_end”
|
||
+ '''
|
||
+ head = tail = ''
|
||
+ with open(filename, mode='r') as translit_file:
|
||
+ for line in translit_file:
|
||
+ head = head + line
|
||
+ if line.startswith('translit_start'):
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ if line.startswith('translit_end'):
|
||
+ tail = line
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ tail = tail + line
|
||
+ return (head, tail)
|
||
+
|
||
+def output_head(translit_file, unicode_version, head=''):
|
||
+ '''Write the header of the output file, i.e. the part of the file
|
||
+ before the “translit_start” line.
|
||
+ '''
|
||
+ if ARGS.input_file and head:
|
||
+ translit_file.write(head)
|
||
+ else:
|
||
+ translit_file.write('escape_char /\n')
|
||
+ translit_file.write('comment_char %\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('% Transliterations of compatibility characters ')
|
||
+ translit_file.write('and ligatures.\n')
|
||
+ translit_file.write('% Generated automatically from UnicodeData.txt '
|
||
+ + 'by gen_translit_compat.py '
|
||
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
|
||
+ + 'for Unicode {:s}.\n'.format(unicode_version))
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('LC_CTYPE\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('translit_start\n')
|
||
+
|
||
+def output_tail(translit_file, tail=''):
|
||
+ '''Write the tail of the output file'''
|
||
+ if ARGS.input_file and tail:
|
||
+ translit_file.write(tail)
|
||
+ else:
|
||
+ translit_file.write('translit_end\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('END LC_CTYPE\n')
|
||
+
|
||
+def compatibility_decompose(code_point):
|
||
+ '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
|
||
+
|
||
+ “The compatibility decomposition is formed by recursively applying
|
||
+ the canonical and compatibility mappings, then applying the
|
||
+ Canonical Ordering Algorithm.”
|
||
+
|
||
+ We don’t do the canonical decomposition here because this is
|
||
+ done in gen_translit_combining.py to generate translit_combining.
|
||
+
|
||
+ And we ignore some of the possible compatibility formatting tags
|
||
+ here. Some of them are used in other translit_* files, not
|
||
+ translit_compat:
|
||
+
|
||
+ <font>: translit_font
|
||
+ <circle>: translit_circle
|
||
+ <wide>: translit_wide
|
||
+ <narrow>: translit_narrow
|
||
+ <square>: translit_cjk_compat
|
||
+ <fraction>: translit_fraction
|
||
+
|
||
+ And we ignore
|
||
+
|
||
+ <noBreak>, <initial>, <medial>, <final>, <isolated>
|
||
+
|
||
+ because they seem to be not useful for transliteration.
|
||
+ '''
|
||
+ decomposition = unicode_utils.UNICODE_ATTRIBUTES[
|
||
+ code_point]['decomposition']
|
||
+ compatibility_tags = (
|
||
+ '<compat>', '<super>', '<sub>', '<vertical>')
|
||
+ for compatibility_tag in compatibility_tags:
|
||
+ if decomposition.startswith(compatibility_tag):
|
||
+ decomposition = decomposition[len(compatibility_tag)+1:]
|
||
+ decomposed_code_points = [int(x, 16)
|
||
+ for x in decomposition.split(' ')]
|
||
+ if (len(decomposed_code_points) > 1
|
||
+ and decomposed_code_points[0] == 0x0020
|
||
+ and decomposed_code_points[1] >= 0x0300
|
||
+ and decomposed_code_points[1] <= 0x03FF):
|
||
+ # Decomposes into a space followed by a combining character.
|
||
+ # This is not useful for transliteration.
|
||
+ return []
|
||
+ else:
|
||
+ return_value = []
|
||
+ for index in range(0, len(decomposed_code_points)):
|
||
+ cd_code_points = compatibility_decompose(
|
||
+ decomposed_code_points[index])
|
||
+ if cd_code_points:
|
||
+ return_value += cd_code_points
|
||
+ else:
|
||
+ return_value += [decomposed_code_points[index]]
|
||
+ return return_value
|
||
+ return []
|
||
+
|
||
+def special_decompose(code_point_list):
|
||
+ '''
|
||
+ Decompositions which are not in UnicodeData.txt at all but which
|
||
+ were used in the original translit_compat file in glibc and
|
||
+ which seem to make sense. I want to keep the update of
|
||
+ translit_compat close to the spirit of the original file,
|
||
+ therefore I added these special decomposition rules here.
|
||
+ '''
|
||
+ special_decompose_dict = {
|
||
+ (0x03BC,): [0x0075], # μ → u
|
||
+ (0x02BC,): [0x0027], # ʼ → '
|
||
+ }
|
||
+ if tuple(code_point_list) in special_decompose_dict:
|
||
+ return special_decompose_dict[tuple(code_point_list)]
|
||
+ else:
|
||
+ return code_point_list
|
||
+
|
||
+def special_ligature_decompose(code_point):
|
||
+ '''
|
||
+ Decompositions for ligatures which are not in UnicodeData.txt at
|
||
+ all but which were used in the original translit_compat file in
|
||
+ glibc and which seem to make sense. I want to keep the update of
|
||
+ translit_compat close to the spirit of the original file,
|
||
+ therefore I added these special ligature decomposition rules here.
|
||
+
|
||
+ '''
|
||
+ special_ligature_decompose_dict = {
|
||
+ 0x00E6: [0x0061, 0x0065], # æ → ae
|
||
+ 0x00C6: [0x0041, 0x0045], # Æ → AE
|
||
+ # These following 5 special ligature decompositions were
|
||
+ # in the original glibc/localedata/locales/translit_compat file
|
||
+ 0x0152: [0x004F, 0x0045], # Œ → OE
|
||
+ 0x0153: [0x006F, 0x0065], # œ → oe
|
||
+ 0x05F0: [0x05D5, 0x05D5], # װ → וו
|
||
+ 0x05F1: [0x05D5, 0x05D9], # ױ → וי
|
||
+ 0x05F2: [0x05D9, 0x05D9], # ײ → יי
|
||
+ # The following special ligature decompositions were
|
||
+ # not in the original glibc/localedata/locales/translit_compat file
|
||
+ # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
|
||
+ # → U+041D CYRILLIC CAPITAL LETTER EN,
|
||
+ # U+0413 CYRILLIC CAPITAL LETTER GHE
|
||
+ 0x04A4: [0x041D, 0x0413], # Ҥ → НГ
|
||
+ # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
|
||
+ # → U+043D CYRILLIC SMALL LETTER EN,
|
||
+ # U+0433 CYRILLIC SMALL LETTER GHE
|
||
+ 0x04A5: [0x043D, 0x0433], # ҥ → нг
|
||
+ # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
|
||
+ # → U+0422 CYRILLIC CAPITAL LETTER TE,
|
||
+ # U+0426 CYRILLIC CAPITAL LETTER TSE
|
||
+ 0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
|
||
+ # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
|
||
+ # → U+0442 CYRILLIC SMALL LETTER TE,
|
||
+ # U+0446 CYRILLIC SMALL LETTER TSE
|
||
+ 0x04B5: [0x0442, 0x0446], # ҵ → тц
|
||
+ # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
|
||
+ # → U+0410 CYRILLIC CAPITAL LETTER A
|
||
+ # U+0415 CYRILLIC CAPITAL LETTER IE
|
||
+ 0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
|
||
+ # U+04D5 CYRILLIC SMALL LIGATURE A IE
|
||
+ # → U+0430 CYRILLIC SMALL LETTER A,
|
||
+ # U+0435 CYRILLIC SMALL LETTER IE
|
||
+ 0x04D5: [0x0430, 0x0435], # ӕ → ае
|
||
+ # I am not sure what to do with the following ligatures
|
||
+ # maybe it makes no sense to decompose them:
|
||
+ # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
|
||
+ # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
|
||
+ # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
|
||
+ # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
|
||
+ # U+fe20 COMBINING LIGATURE LEFT HALF
|
||
+ # U+fe21 COMBINING LIGATURE RIGHT HALF
|
||
+ # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
|
||
+ # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
|
||
+ # U+11176 MAHAJANI LIGATURE SHRI
|
||
+ # U+1f670 SCRIPT LIGATURE ET ORNAMENT
|
||
+ # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
|
||
+ # U+1f672 LIGATURE OPEN ET ORNAMENT
|
||
+ # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
|
||
+ }
|
||
+ if code_point in special_ligature_decompose_dict:
|
||
+ return special_ligature_decompose_dict[code_point]
|
||
+ else:
|
||
+ return [code_point]
|
||
+
|
||
+def output_transliteration(translit_file):
|
||
+ '''Write the new transliteration to the output file'''
|
||
+ translit_file.write('\n')
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ decomposed_code_points = [compatibility_decompose(code_point)]
|
||
+ if not decomposed_code_points[0]:
|
||
+ if special_decompose([code_point]) != [code_point]:
|
||
+ decomposed_code_points[0] = special_decompose([code_point])
|
||
+ else:
|
||
+ special_decomposed_code_points = []
|
||
+ while True:
|
||
+ special_decomposed_code_points = special_decompose(
|
||
+ decomposed_code_points[-1])
|
||
+ if (special_decomposed_code_points
|
||
+ != decomposed_code_points[-1]):
|
||
+ decomposed_code_points.append(
|
||
+ special_decomposed_code_points)
|
||
+ continue
|
||
+ special_decomposed_code_points = []
|
||
+ for decomposed_code_point in decomposed_code_points[-1]:
|
||
+ special_decomposed_code_points += special_decompose(
|
||
+ [decomposed_code_point])
|
||
+ if (special_decomposed_code_points
|
||
+ == decomposed_code_points[-1]):
|
||
+ break
|
||
+ decomposed_code_points.append(
|
||
+ special_decomposed_code_points)
|
||
+ if decomposed_code_points[0]:
|
||
+ translit_file.write('% {:s}\n'.format(name))
|
||
+ translit_file.write('{:s} '.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ for index in range(0, len(decomposed_code_points)):
|
||
+ if index > 0:
|
||
+ translit_file.write(';')
|
||
+ translit_file.write('"')
|
||
+ for decomposed_code_point in decomposed_code_points[index]:
|
||
+ translit_file.write('{:s}'.format(
|
||
+ unicode_utils.ucs_symbol(decomposed_code_point)))
|
||
+ translit_file.write('"')
|
||
+ translit_file.write('\n')
|
||
+ elif 'LIGATURE' in name and 'ARABIC' not in name:
|
||
+ decomposed_code_points = special_ligature_decompose(code_point)
|
||
+ if decomposed_code_points[0] != code_point:
|
||
+ translit_file.write('% {:s}\n'.format(name))
|
||
+ translit_file.write('{:s} '.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ translit_file.write('"')
|
||
+ for decomposed_code_point in decomposed_code_points:
|
||
+ translit_file.write('{:s}'.format(
|
||
+ unicode_utils.ucs_symbol(decomposed_code_point)))
|
||
+ translit_file.write('"')
|
||
+ translit_file.write('\n')
|
||
+ else:
|
||
+ print('Warning: unhandled ligature: {:x} {:s}'.format(
|
||
+ code_point, name))
|
||
+ translit_file.write('\n')
|
||
+
|
||
+if __name__ == "__main__":
|
||
+ PARSER = argparse.ArgumentParser(
|
||
+ description='''
|
||
+ Generate a translit_compat file from UnicodeData.txt.
|
||
+ ''')
|
||
+ PARSER.add_argument(
|
||
+ '-u', '--unicode_data_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='UnicodeData.txt',
|
||
+ help=('The UnicodeData.txt file to read, '
|
||
+ + 'default: %(default)s'))
|
||
+ PARSER.add_argument(
|
||
+ '-i', '--input_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ help=''' The original glibc/localedata/locales/translit_compat
|
||
+ file.''')
|
||
+ PARSER.add_argument(
|
||
+ '-o', '--output_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='translit_compat.new',
|
||
+ help='''The new translit_compat file, default: %(default)s. If the
|
||
+ original glibc/localedata/locales/translit_compat file has
|
||
+ been given as an option, the header up to the
|
||
+ “translit_start” line and the tail from the “translit_end”
|
||
+ line to the end of the file will be copied unchanged into the
|
||
+ output file. ''')
|
||
+ PARSER.add_argument(
|
||
+ '--unicode_version',
|
||
+ nargs='?',
|
||
+ required=True,
|
||
+ type=str,
|
||
+ help='The Unicode version of the input files used.')
|
||
+ ARGS = PARSER.parse_args()
|
||
+
|
||
+ unicode_utils.fill_attributes(ARGS.unicode_data_file)
|
||
+ HEAD = TAIL = ''
|
||
+ if ARGS.input_file:
|
||
+ (HEAD, TAIL) = read_input_file(ARGS.input_file)
|
||
+ with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
|
||
+ output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
|
||
+ output_transliteration(TRANSLIT_FILE)
|
||
+ output_tail(TRANSLIT_FILE, tail=TAIL)
|
||
diff --git a/localedata/unicode-gen/gen_translit_font.py b/localedata/unicode-gen/gen_translit_font.py
|
||
new file mode 100755
|
||
index 0000000..0723622
|
||
--- /dev/null
|
||
+++ b/localedata/unicode-gen/gen_translit_font.py
|
||
@@ -0,0 +1,156 @@
|
||
+#!/usr/bin/python3
|
||
+# -*- coding: utf-8 -*-
|
||
+#
|
||
+# Generate a translit_font file from a UnicodeData file.
|
||
+# Copyright (C) 2015 Free Software Foundation, Inc.
|
||
+# This file is part of the GNU C Library.
|
||
+#
|
||
+# The GNU C Library is free software; you can redistribute it and/or
|
||
+# modify it under the terms of the GNU Lesser General Public
|
||
+# License as published by the Free Software Foundation; either
|
||
+# version 2.1 of the License, or (at your option) any later version.
|
||
+#
|
||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
+# Lesser General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU Lesser General Public
|
||
+# License along with the GNU C Library; if not, see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+'''
|
||
+Generate a translit_font file from UnicodeData.txt
|
||
+
|
||
+To see how this script is used, call it with the “-h” option:
|
||
+
|
||
+ $ ./gen_translit_font.py -h
|
||
+ … prints usage message …
|
||
+'''
|
||
+
|
||
+import argparse
|
||
+import time
|
||
+import unicode_utils
|
||
+
|
||
+def read_input_file(filename):
|
||
+ '''Reads the original glibc translit_font file to get the
|
||
+ original head and tail.
|
||
+
|
||
+ We want to replace only the part of the file between
|
||
+ “translit_start” and “translit_end”
|
||
+ '''
|
||
+ head = tail = ''
|
||
+ with open(filename, mode='r') as translit_file:
|
||
+ for line in translit_file:
|
||
+ head = head + line
|
||
+ if line.startswith('translit_start'):
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ if line.startswith('translit_end'):
|
||
+ tail = line
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ tail = tail + line
|
||
+ return (head, tail)
|
||
+
|
||
+def output_head(translit_file, unicode_version, head=''):
|
||
+ '''Write the header of the output file, i.e. the part of the file
|
||
+ before the “translit_start” line.
|
||
+ '''
|
||
+ if ARGS.input_file and head:
|
||
+ translit_file.write(head)
|
||
+ else:
|
||
+ translit_file.write('escape_char /\n')
|
||
+ translit_file.write('comment_char %\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('% Transliterations of font equivalents.\n')
|
||
+ translit_file.write('% Generated automatically from UnicodeData.txt '
|
||
+ + 'by gen_translit_font.py '
|
||
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
|
||
+ + 'for Unicode {:s}.\n'.format(unicode_version))
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('LC_CTYPE\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('translit_start\n')
|
||
+
|
||
+def output_tail(translit_file, tail=''):
|
||
+ '''Write the tail of the output file'''
|
||
+ if ARGS.input_file and tail:
|
||
+ translit_file.write(tail)
|
||
+ else:
|
||
+ translit_file.write('translit_end\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('END LC_CTYPE\n')
|
||
+
|
||
+def output_transliteration(translit_file):
|
||
+ '''Write the new transliteration to the output file'''
|
||
+ translit_file.write('\n')
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ decomposition = unicode_utils.UNICODE_ATTRIBUTES[
|
||
+ code_point]['decomposition']
|
||
+ if decomposition.startswith('<font>'):
|
||
+ decomposition = decomposition[7:]
|
||
+ decomposed_code_points = [[int(x, 16)
|
||
+ for x in decomposition.split(' ')]]
|
||
+ if decomposed_code_points[0]:
|
||
+ translit_file.write('{:s} '.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ for index in range(0, len(decomposed_code_points)):
|
||
+ if index > 0:
|
||
+ translit_file.write(';')
|
||
+ if len(decomposed_code_points[index]) > 1:
|
||
+ translit_file.write('"')
|
||
+ for decomposed_code_point in decomposed_code_points[index]:
|
||
+ translit_file.write('{:s}'.format(
|
||
+ unicode_utils.ucs_symbol(decomposed_code_point)))
|
||
+ if len(decomposed_code_points[index]) > 1:
|
||
+ translit_file.write('"')
|
||
+ translit_file.write(' % {:s}\n'.format(name))
|
||
+ translit_file.write('\n')
|
||
+
|
||
+if __name__ == "__main__":
|
||
+ PARSER = argparse.ArgumentParser(
|
||
+ description='''
|
||
+ Generate a translit_font file from UnicodeData.txt.
|
||
+ ''')
|
||
+ PARSER.add_argument(
|
||
+ '-u', '--unicode_data_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='UnicodeData.txt',
|
||
+ help=('The UnicodeData.txt file to read, '
|
||
+ + 'default: %(default)s'))
|
||
+ PARSER.add_argument(
|
||
+ '-i', '--input_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ help=''' The original glibc/localedata/locales/translit_font
|
||
+ file.''')
|
||
+ PARSER.add_argument(
|
||
+ '-o', '--output_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='translit_font.new',
|
||
+ help='''The new translit_font file, default: %(default)s. If the
|
||
+ original glibc/localedata/locales/translit_font file has
|
||
+ been given as an option, the header up to the
|
||
+ “translit_start” line and the tail from the “translit_end”
|
||
+ line to the end of the file will be copied unchanged into the
|
||
+ output file. ''')
|
||
+ PARSER.add_argument(
|
||
+ '--unicode_version',
|
||
+ nargs='?',
|
||
+ required=True,
|
||
+ type=str,
|
||
+ help='The Unicode version of the input files used.')
|
||
+ ARGS = PARSER.parse_args()
|
||
+
|
||
+ unicode_utils.fill_attributes(ARGS.unicode_data_file)
|
||
+ HEAD = TAIL = ''
|
||
+ if ARGS.input_file:
|
||
+ (HEAD, TAIL) = read_input_file(ARGS.input_file)
|
||
+ with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
|
||
+ output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
|
||
+ output_transliteration(TRANSLIT_FILE)
|
||
+ output_tail(TRANSLIT_FILE, tail=TAIL)
|
||
diff --git a/localedata/unicode-gen/gen_translit_fraction.py b/localedata/unicode-gen/gen_translit_fraction.py
|
||
new file mode 100755
|
||
index 0000000..5bf63ea
|
||
--- /dev/null
|
||
+++ b/localedata/unicode-gen/gen_translit_fraction.py
|
||
@@ -0,0 +1,197 @@
|
||
+#!/usr/bin/python3
|
||
+# -*- coding: utf-8 -*-
|
||
+#
|
||
+# Generate a translit_fraction file from a UnicodeData file.
|
||
+# Copyright (C) 2015 Free Software Foundation, Inc.
|
||
+# This file is part of the GNU C Library.
|
||
+#
|
||
+# The GNU C Library is free software; you can redistribute it and/or
|
||
+# modify it under the terms of the GNU Lesser General Public
|
||
+# License as published by the Free Software Foundation; either
|
||
+# version 2.1 of the License, or (at your option) any later version.
|
||
+#
|
||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
+# Lesser General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU Lesser General Public
|
||
+# License along with the GNU C Library; if not, see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+'''
|
||
+Generate a translit_fraction file from UnicodeData.txt
|
||
+
|
||
+To see how this script is used, call it with the “-h” option:
|
||
+
|
||
+ $ ./gen_translit_fraction.py -h
|
||
+ … prints usage message …
|
||
+'''
|
||
+
|
||
+import argparse
|
||
+import time
|
||
+import unicode_utils
|
||
+
|
||
+def read_input_file(filename):
|
||
+ '''Reads the original glibc translit_fraction file to get the
|
||
+ original head and tail.
|
||
+
|
||
+ We want to replace only the part of the file between
|
||
+ “translit_start” and “translit_end”
|
||
+ '''
|
||
+ head = tail = ''
|
||
+ with open(filename, mode='r') as translit_file:
|
||
+ for line in translit_file:
|
||
+ head = head + line
|
||
+ if line.startswith('translit_start'):
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ if line.startswith('translit_end'):
|
||
+ tail = line
|
||
+ break
|
||
+ for line in translit_file:
|
||
+ tail = tail + line
|
||
+ return (head, tail)
|
||
+
|
||
+def output_head(translit_file, unicode_version, head=''):
|
||
+ '''Write the header of the output file, i.e. the part of the file
|
||
+ before the “translit_start” line.
|
||
+ '''
|
||
+ if ARGS.input_file and head:
|
||
+ translit_file.write(head)
|
||
+ else:
|
||
+ translit_file.write('escape_char /\n')
|
||
+ translit_file.write('comment_char %\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('% Transliterations of fractions.\n')
|
||
+ translit_file.write('% Generated automatically from UnicodeData.txt '
|
||
+ + 'by gen_translit_fraction.py '
|
||
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
|
||
+ + 'for Unicode {:s}.\n'.format(unicode_version))
|
||
+ translit_file.write('% The replacements have been surrounded ')
|
||
+ translit_file.write('with spaces, because fractions are\n')
|
||
+ translit_file.write('% often preceded by a decimal number and ')
|
||
+ translit_file.write('followed by a unit or a math symbol.\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('LC_CTYPE\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('translit_start\n')
|
||
+
|
||
+def output_tail(translit_file, tail=''):
|
||
+ '''Write the tail of the output file'''
|
||
+ if ARGS.input_file and tail:
|
||
+ translit_file.write(tail)
|
||
+ else:
|
||
+ translit_file.write('translit_end\n')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('END LC_CTYPE\n')
|
||
+
|
||
+def special_decompose(code_point_list):
|
||
+ '''
|
||
+ Decompositions which are not in UnicodeData.txt at all but which
|
||
+ were used in the original translit_fraction file in glibc and
|
||
+ which seem to make sense. I want to keep the update of
|
||
+ translit_fraction close to the spirit of the original file,
|
||
+ therefore I added these special decomposition rules here.
|
||
+ '''
|
||
+ special_decompose_dict = {
|
||
+ (0x2044,): [0x002F], # ⁄ → /
|
||
+ }
|
||
+ if tuple(code_point_list) in special_decompose_dict:
|
||
+ return special_decompose_dict[tuple(code_point_list)]
|
||
+ else:
|
||
+ return code_point_list
|
||
+
|
||
+def output_transliteration(translit_file):
|
||
+ '''Write the new transliteration to the output file'''
|
||
+ translit_file.write('\n')
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ decomposition = unicode_utils.UNICODE_ATTRIBUTES[
|
||
+ code_point]['decomposition']
|
||
+ if decomposition.startswith('<fraction>'):
|
||
+ decomposition = decomposition[11:]
|
||
+ decomposed_code_points = [[int(x, 16)
|
||
+ for x in decomposition.split(' ')]]
|
||
+ if decomposed_code_points[0]:
|
||
+ decomposed_code_points[0] = [0x0020] \
|
||
+ + decomposed_code_points[0] \
|
||
+ + [0x0020]
|
||
+ while True:
|
||
+ special_decomposed_code_points = special_decompose(
|
||
+ decomposed_code_points[-1])
|
||
+ if (special_decomposed_code_points
|
||
+ != decomposed_code_points[-1]):
|
||
+ decomposed_code_points.append(
|
||
+ special_decomposed_code_points)
|
||
+ continue
|
||
+ special_decomposed_code_points = []
|
||
+ for decomposed_code_point in decomposed_code_points[-1]:
|
||
+ special_decomposed_code_points += special_decompose(
|
||
+ [decomposed_code_point])
|
||
+ if (special_decomposed_code_points
|
||
+ == decomposed_code_points[-1]):
|
||
+ break
|
||
+ decomposed_code_points.append(
|
||
+ special_decomposed_code_points)
|
||
+ translit_file.write('% {:s}\n'.format(name))
|
||
+ translit_file.write('{:s} '.format(
|
||
+ unicode_utils.ucs_symbol(code_point)))
|
||
+ for index in range(0, len(decomposed_code_points)):
|
||
+ if index > 0:
|
||
+ translit_file.write(';')
|
||
+ if len(decomposed_code_points[index]) > 1:
|
||
+ translit_file.write('"')
|
||
+ for decomposed_code_point in decomposed_code_points[index]:
|
||
+ translit_file.write('{:s}'.format(
|
||
+ unicode_utils.ucs_symbol(decomposed_code_point)))
|
||
+ if len(decomposed_code_points[index]) > 1:
|
||
+ translit_file.write('"')
|
||
+ translit_file.write('\n')
|
||
+ translit_file.write('\n')
|
||
+
|
||
+if __name__ == "__main__":
|
||
+ PARSER = argparse.ArgumentParser(
|
||
+ description='''
|
||
+ Generate a translit_fraction file from UnicodeData.txt.
|
||
+ ''')
|
||
+ PARSER.add_argument(
|
||
+ '-u', '--unicode_data_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='UnicodeData.txt',
|
||
+ help=('The UnicodeData.txt file to read, '
|
||
+ + 'default: %(default)s'))
|
||
+ PARSER.add_argument(
|
||
+ '-i', '--input_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ help=''' The original glibc/localedata/locales/translit_fraction
|
||
+ file.''')
|
||
+ PARSER.add_argument(
|
||
+ '-o', '--output_file',
|
||
+ nargs='?',
|
||
+ type=str,
|
||
+ default='translit_fraction.new',
|
||
+ help='''The new translit_fraction file, default: %(default)s. If the
|
||
+ original glibc/localedata/locales/translit_fraction file has
|
||
+ been given as an option, the header up to the
|
||
+ “translit_start” line and the tail from the “translit_end”
|
||
+ line to the end of the file will be copied unchanged into the
|
||
+ output file. ''')
|
||
+ PARSER.add_argument(
|
||
+ '--unicode_version',
|
||
+ nargs='?',
|
||
+ required=True,
|
||
+ type=str,
|
||
+ help='The Unicode version of the input files used.')
|
||
+ ARGS = PARSER.parse_args()
|
||
+
|
||
+ unicode_utils.fill_attributes(ARGS.unicode_data_file)
|
||
+ HEAD = TAIL = ''
|
||
+ if ARGS.input_file:
|
||
+ (HEAD, TAIL) = read_input_file(ARGS.input_file)
|
||
+ with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
|
||
+ output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
|
||
+ output_transliteration(TRANSLIT_FILE)
|
||
+ output_tail(TRANSLIT_FILE, tail=TAIL)
|
||
diff --git a/localedata/unicode-gen/gen_unicode_ctype.py b/localedata/unicode-gen/gen_unicode_ctype.py
|
||
index 0c74f2a..0f064f5 100755
|
||
--- a/localedata/unicode-gen/gen_unicode_ctype.py
|
||
+++ b/localedata/unicode-gen/gen_unicode_ctype.py
|
||
@@ -30,345 +30,9 @@ To see how this script is used, call it with the “-h” option:
|
||
'''
|
||
|
||
import argparse
|
||
-import sys
|
||
import time
|
||
import re
|
||
-
|
||
-# Dictionary holding the entire contents of the UnicodeData.txt file
|
||
-#
|
||
-# Contents of this dictionary look like this:
|
||
-#
|
||
-# {0: {'category': 'Cc',
|
||
-# 'title': None,
|
||
-# 'digit': '',
|
||
-# 'name': '<control>',
|
||
-# 'bidi': 'BN',
|
||
-# 'combining': '0',
|
||
-# 'comment': '',
|
||
-# 'oldname': 'NULL',
|
||
-# 'decomposition': '',
|
||
-# 'upper': None,
|
||
-# 'mirrored': 'N',
|
||
-# 'lower': None,
|
||
-# 'decdigit': '',
|
||
-# 'numeric': ''},
|
||
-# …
|
||
-# }
|
||
-UNICODE_ATTRIBUTES = {}
|
||
-
|
||
-# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
|
||
-#
|
||
-# Contents of this dictionary look like this:
|
||
-#
|
||
-# {917504: ['Default_Ignorable_Code_Point'],
|
||
-# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
|
||
-# …
|
||
-# }
|
||
-DERIVED_CORE_PROPERTIES = {}
|
||
-
|
||
-def fill_attribute(code_point, fields):
|
||
- '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
|
||
-
|
||
- One entry in the UNICODE_ATTRIBUTES dictionary represents one line
|
||
- in the UnicodeData.txt file.
|
||
-
|
||
- '''
|
||
- UNICODE_ATTRIBUTES[code_point] = {
|
||
- 'name': fields[1], # Character name
|
||
- 'category': fields[2], # General category
|
||
- 'combining': fields[3], # Canonical combining classes
|
||
- 'bidi': fields[4], # Bidirectional category
|
||
- 'decomposition': fields[5], # Character decomposition mapping
|
||
- 'decdigit': fields[6], # Decimal digit value
|
||
- 'digit': fields[7], # Digit value
|
||
- 'numeric': fields[8], # Numeric value
|
||
- 'mirrored': fields[9], # mirrored
|
||
- 'oldname': fields[10], # Old Unicode 1.0 name
|
||
- 'comment': fields[11], # comment
|
||
- # Uppercase mapping
|
||
- 'upper': int(fields[12], 16) if fields[12] else None,
|
||
- # Lowercase mapping
|
||
- 'lower': int(fields[13], 16) if fields[13] else None,
|
||
- # Titlecase mapping
|
||
- 'title': int(fields[14], 16) if fields[14] else None,
|
||
- }
|
||
-
|
||
-def fill_attributes(filename):
|
||
- '''Stores the entire contents of the UnicodeData.txt file
|
||
- in the UNICODE_ATTRIBUTES dictionary.
|
||
-
|
||
- A typical line for a single code point in UnicodeData.txt looks
|
||
- like this:
|
||
-
|
||
- 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
|
||
-
|
||
- Code point ranges are indicated by pairs of lines like this:
|
||
-
|
||
- 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
|
||
- 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
|
||
- '''
|
||
- with open(filename, mode='r') as unicode_data_file:
|
||
- fields_start = []
|
||
- for line in unicode_data_file:
|
||
- fields = line.strip().split(';')
|
||
- if len(fields) != 15:
|
||
- sys.stderr.write(
|
||
- 'short line in file "%(f)s": %(l)s\n' %{
|
||
- 'f': filename, 'l': line})
|
||
- exit(1)
|
||
- if fields[2] == 'Cs':
|
||
- # Surrogates are UTF-16 artefacts,
|
||
- # not real characters. Ignore them.
|
||
- fields_start = []
|
||
- continue
|
||
- if fields[1].endswith(', First>'):
|
||
- fields_start = fields
|
||
- fields_start[1] = fields_start[1].split(',')[0][1:]
|
||
- continue
|
||
- if fields[1].endswith(', Last>'):
|
||
- fields[1] = fields[1].split(',')[0][1:]
|
||
- if fields[1:] != fields_start[1:]:
|
||
- sys.stderr.write(
|
||
- 'broken code point range in file "%(f)s": %(l)s\n' %{
|
||
- 'f': filename, 'l': line})
|
||
- exit(1)
|
||
- for code_point in range(
|
||
- int(fields_start[0], 16),
|
||
- int(fields[0], 16)+1):
|
||
- fill_attribute(code_point, fields)
|
||
- fields_start = []
|
||
- continue
|
||
- fill_attribute(int(fields[0], 16), fields)
|
||
- fields_start = []
|
||
-
|
||
-def fill_derived_core_properties(filename):
|
||
- '''Stores the entire contents of the DerivedCoreProperties.txt file
|
||
- in the DERIVED_CORE_PROPERTIES dictionary.
|
||
-
|
||
- Lines in DerivedCoreProperties.txt are either a code point range like
|
||
- this:
|
||
-
|
||
- 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
|
||
-
|
||
- or a single code point like this:
|
||
-
|
||
- 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
|
||
-
|
||
- '''
|
||
- with open(filename, mode='r') as derived_core_properties_file:
|
||
- for line in derived_core_properties_file:
|
||
- match = re.match(
|
||
- r'^(?P<codepoint1>[0-9A-F]{4,6})'
|
||
- + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
|
||
- + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
|
||
- line)
|
||
- if not match:
|
||
- continue
|
||
- start = match.group('codepoint1')
|
||
- end = match.group('codepoint2')
|
||
- if not end:
|
||
- end = start
|
||
- for code_point in range(int(start, 16), int(end, 16)+1):
|
||
- prop = match.group('property')
|
||
- if code_point in DERIVED_CORE_PROPERTIES:
|
||
- DERIVED_CORE_PROPERTIES[code_point].append(prop)
|
||
- else:
|
||
- DERIVED_CORE_PROPERTIES[code_point] = [prop]
|
||
-
|
||
-def to_upper(code_point):
|
||
- '''Returns the code point of the uppercase version
|
||
- of the given code point'''
|
||
- if (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and UNICODE_ATTRIBUTES[code_point]['upper']):
|
||
- return UNICODE_ATTRIBUTES[code_point]['upper']
|
||
- else:
|
||
- return code_point
|
||
-
|
||
-def to_lower(code_point):
|
||
- '''Returns the code point of the lowercase version
|
||
- of the given code point'''
|
||
- if (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and UNICODE_ATTRIBUTES[code_point]['lower']):
|
||
- return UNICODE_ATTRIBUTES[code_point]['lower']
|
||
- else:
|
||
- return code_point
|
||
-
|
||
-def to_title(code_point):
|
||
- '''Returns the code point of the titlecase version
|
||
- of the given code point'''
|
||
- if (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and UNICODE_ATTRIBUTES[code_point]['title']):
|
||
- return UNICODE_ATTRIBUTES[code_point]['title']
|
||
- else:
|
||
- return code_point
|
||
-
|
||
-def is_upper(code_point):
|
||
- '''Checks whether the character with this code point is uppercase'''
|
||
- return (to_lower(code_point) != code_point
|
||
- or (code_point in DERIVED_CORE_PROPERTIES
|
||
- and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
|
||
-
|
||
-def is_lower(code_point):
|
||
- '''Checks whether the character with this code point is lowercase'''
|
||
- # Some characters are defined as “Lowercase” in
|
||
- # DerivedCoreProperties.txt but do not have a mapping to upper
|
||
- # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
|
||
- # one of these.
|
||
- return (to_upper(code_point) != code_point
|
||
- # <U00DF> is lowercase, but without simple to_upper mapping.
|
||
- or code_point == 0x00DF
|
||
- or (code_point in DERIVED_CORE_PROPERTIES
|
||
- and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
|
||
-
|
||
-def is_alpha(code_point):
|
||
- '''Checks whether the character with this code point is alphabetic'''
|
||
- return ((code_point in DERIVED_CORE_PROPERTIES
|
||
- and
|
||
- 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
|
||
- or
|
||
- # Consider all the non-ASCII digits as alphabetic.
|
||
- # ISO C 99 forbids us to have them in category “digit”,
|
||
- # but we want iswalnum to return true on them.
|
||
- (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
|
||
- and not (code_point >= 0x0030 and code_point <= 0x0039)))
|
||
-
|
||
-def is_digit(code_point):
|
||
- '''Checks whether the character with this code point is a digit'''
|
||
- if False:
|
||
- return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
|
||
- # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
|
||
- # a zero. Must add <0> in front of them by hand.
|
||
- else:
|
||
- # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
|
||
- # takes it away:
|
||
- # 7.25.2.1.5:
|
||
- # The iswdigit function tests for any wide character that
|
||
- # corresponds to a decimal-digit character (as defined in 5.2.1).
|
||
- # 5.2.1:
|
||
- # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
|
||
- return (code_point >= 0x0030 and code_point <= 0x0039)
|
||
-
|
||
-def is_outdigit(code_point):
|
||
- '''Checks whether the character with this code point is outdigit'''
|
||
- return (code_point >= 0x0030 and code_point <= 0x0039)
|
||
-
|
||
-def is_blank(code_point):
|
||
- '''Checks whether the character with this code point is blank'''
|
||
- return (code_point == 0x0009 # '\t'
|
||
- # Category Zs without mention of '<noBreak>'
|
||
- or (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
|
||
- and '<noBreak>' not in
|
||
- UNICODE_ATTRIBUTES[code_point]['decomposition']))
|
||
-
|
||
-def is_space(code_point):
|
||
- '''Checks whether the character with this code point is a space'''
|
||
- # Don’t make U+00A0 a space. Non-breaking space means that all programs
|
||
- # should treat it like a punctuation character, not like a space.
|
||
- return (code_point == 0x0020 # ' '
|
||
- or code_point == 0x000C # '\f'
|
||
- or code_point == 0x000A # '\n'
|
||
- or code_point == 0x000D # '\r'
|
||
- or code_point == 0x0009 # '\t'
|
||
- or code_point == 0x000B # '\v'
|
||
- # Categories Zl, Zp, and Zs without mention of "<noBreak>"
|
||
- or (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and
|
||
- (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
|
||
- or
|
||
- (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
|
||
- and
|
||
- '<noBreak>' not in
|
||
- UNICODE_ATTRIBUTES[code_point]['decomposition']))))
|
||
-
|
||
-def is_cntrl(code_point):
|
||
- '''Checks whether the character with this code point is
|
||
- a control character'''
|
||
- return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
|
||
- or
|
||
- UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
|
||
-
|
||
-def is_xdigit(code_point):
|
||
- '''Checks whether the character with this code point is
|
||
- a hexadecimal digit'''
|
||
- if False:
|
||
- return (is_digit(code_point)
|
||
- or (code_point >= 0x0041 and code_point <= 0x0046)
|
||
- or (code_point >= 0x0061 and code_point <= 0x0066))
|
||
- else:
|
||
- # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
|
||
- # takes it away:
|
||
- # 7.25.2.1.12:
|
||
- # The iswxdigit function tests for any wide character that
|
||
- # corresponds to a hexadecimal-digit character (as defined
|
||
- # in 6.4.4.1).
|
||
- # 6.4.4.1:
|
||
- # hexadecimal-digit: one of
|
||
- # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
|
||
- return ((code_point >= 0x0030 and code_point <= 0x0039)
|
||
- or (code_point >= 0x0041 and code_point <= 0x0046)
|
||
- or (code_point >= 0x0061 and code_point <= 0x0066))
|
||
-
|
||
-def is_graph(code_point):
|
||
- '''Checks whether the character with this code point is
|
||
- a graphical character'''
|
||
- return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
|
||
- and not is_space(code_point))
|
||
-
|
||
-def is_print(code_point):
|
||
- '''Checks whether the character with this code point is printable'''
|
||
- return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
|
||
- and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
|
||
-
|
||
-def is_punct(code_point):
|
||
- '''Checks whether the character with this code point is punctuation'''
|
||
- if False:
|
||
- return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
|
||
- else:
|
||
- # The traditional POSIX definition of punctuation is every graphic,
|
||
- # non-alphanumeric character.
|
||
- return (is_graph(code_point)
|
||
- and not is_alpha(code_point)
|
||
- and not is_digit(code_point))
|
||
-
|
||
-def is_combining(code_point):
|
||
- '''Checks whether the character with this code point is
|
||
- a combining character'''
|
||
- # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
|
||
- # file. In 3.0.1 it was identical to the union of the general categories
|
||
- # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
|
||
- # PropList.txt file, so we take the latter definition.
|
||
- return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
- and
|
||
- UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
|
||
-
|
||
-def is_combining_level3(code_point):
|
||
- '''Checks whether the character with this code point is
|
||
- a combining level3 character'''
|
||
- return (is_combining(code_point)
|
||
- and
|
||
- int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
|
||
-
|
||
-def ucs_symbol(code_point):
|
||
- '''Return the UCS symbol string for a Unicode character.'''
|
||
- if code_point < 0x10000:
|
||
- return '<U{:04X}>'.format(code_point)
|
||
- else:
|
||
- return '<U{:08X}>'.format(code_point)
|
||
-
|
||
-def ucs_symbol_range(code_point_low, code_point_high):
|
||
- '''Returns a string UCS symbol string for a code point range.
|
||
-
|
||
- Example:
|
||
-
|
||
- <U0041>..<U005A>
|
||
- '''
|
||
- return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
|
||
+import unicode_utils
|
||
|
||
def code_point_ranges(is_class_function):
|
||
'''Returns a list of ranges of code points for which is_class_function
|
||
@@ -379,7 +43,7 @@ def code_point_ranges(is_class_function):
|
||
[[65, 90], [192, 214], [216, 222], [256], … ]
|
||
'''
|
||
cp_ranges = []
|
||
- for code_point in sorted(UNICODE_ATTRIBUTES):
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
if is_class_function(code_point):
|
||
if (cp_ranges
|
||
and cp_ranges[-1][-1] == code_point - 1):
|
||
@@ -413,9 +77,9 @@ def output_charclass(i18n_file, class_name, is_class_function):
|
||
if line.strip():
|
||
line += ';'
|
||
if len(code_point_range) == 1:
|
||
- range_string = ucs_symbol(code_point_range[0])
|
||
+ range_string = unicode_utils.ucs_symbol(code_point_range[0])
|
||
else:
|
||
- range_string = ucs_symbol_range(
|
||
+ range_string = unicode_utils.ucs_symbol_range(
|
||
code_point_range[0], code_point_range[-1])
|
||
if len(line+range_string) > max_column:
|
||
i18n_file.write(line+'/\n')
|
||
@@ -441,15 +105,15 @@ def output_charmap(i18n_file, map_name, map_function):
|
||
line = prefix
|
||
map_string = ''
|
||
i18n_file.write('%s /\n' %map_name)
|
||
- for code_point in sorted(UNICODE_ATTRIBUTES):
|
||
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
|
||
mapped = map_function(code_point)
|
||
if code_point != mapped:
|
||
if line.strip():
|
||
line += ';'
|
||
map_string = '(' \
|
||
- + ucs_symbol(code_point) \
|
||
+ + unicode_utils.ucs_symbol(code_point) \
|
||
+ ',' \
|
||
- + ucs_symbol(mapped) \
|
||
+ + unicode_utils.ucs_symbol(mapped) \
|
||
+ ')'
|
||
if len(line+map_string) > max_column:
|
||
i18n_file.write(line+'/\n')
|
||
@@ -459,110 +123,6 @@ def output_charmap(i18n_file, map_name, map_function):
|
||
i18n_file.write(line+'\n')
|
||
i18n_file.write('\n')
|
||
|
||
-def verifications():
|
||
- '''Tests whether the is_* functions observe the known restrictions'''
|
||
- for code_point in sorted(UNICODE_ATTRIBUTES):
|
||
- # toupper restriction: "Only characters specified for the keywords
|
||
- # lower and upper shall be specified.
|
||
- if (to_upper(code_point) != code_point
|
||
- and not (is_lower(code_point) or is_upper(code_point))):
|
||
- sys.stderr.write(
|
||
- ('%(sym)s is not upper|lower '
|
||
- + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
|
||
- 'sym': ucs_symbol(code_point),
|
||
- 'c': code_point,
|
||
- 'uc': to_upper(code_point)})
|
||
- # tolower restriction: "Only characters specified for the keywords
|
||
- # lower and upper shall be specified.
|
||
- if (to_lower(code_point) != code_point
|
||
- and not (is_lower(code_point) or is_upper(code_point))):
|
||
- sys.stderr.write(
|
||
- ('%(sym)s is not upper|lower '
|
||
- + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
|
||
- 'sym': ucs_symbol(code_point),
|
||
- 'c': code_point,
|
||
- 'uc': to_lower(code_point)})
|
||
- # alpha restriction: "Characters classified as either upper or lower
|
||
- # shall automatically belong to this class.
|
||
- if ((is_lower(code_point) or is_upper(code_point))
|
||
- and not is_alpha(code_point)):
|
||
- sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- # alpha restriction: “No character specified for the keywords cntrl,
|
||
- # digit, punct or space shall be specified.”
|
||
- if (is_alpha(code_point) and is_cntrl(code_point)):
|
||
- sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_alpha(code_point) and is_digit(code_point)):
|
||
- sys.stderr.write('%(sym)s is alpha and digit\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_alpha(code_point) and is_punct(code_point)):
|
||
- sys.stderr.write('%(sym)s is alpha and punct\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_alpha(code_point) and is_space(code_point)):
|
||
- sys.stderr.write('%(sym)s is alpha and space\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- # space restriction: “No character specified for the keywords upper,
|
||
- # lower, alpha, digit, graph or xdigit shall be specified.”
|
||
- # upper, lower, alpha already checked above.
|
||
- if (is_space(code_point) and is_digit(code_point)):
|
||
- sys.stderr.write('%(sym)s is space and digit\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_space(code_point) and is_graph(code_point)):
|
||
- sys.stderr.write('%(sym)s is space and graph\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_space(code_point) and is_xdigit(code_point)):
|
||
- sys.stderr.write('%(sym)s is space and xdigit\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- # cntrl restriction: “No character specified for the keywords upper,
|
||
- # lower, alpha, digit, punct, graph, print or xdigit shall be
|
||
- # specified.” upper, lower, alpha already checked above.
|
||
- if (is_cntrl(code_point) and is_digit(code_point)):
|
||
- sys.stderr.write('%(sym)s is cntrl and digit\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_cntrl(code_point) and is_punct(code_point)):
|
||
- sys.stderr.write('%(sym)s is cntrl and punct\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_cntrl(code_point) and is_graph(code_point)):
|
||
- sys.stderr.write('%(sym)s is cntrl and graph\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_cntrl(code_point) and is_print(code_point)):
|
||
- sys.stderr.write('%(sym)s is cntrl and print\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_cntrl(code_point) and is_xdigit(code_point)):
|
||
- sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- # punct restriction: “No character specified for the keywords upper,
|
||
- # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
|
||
- # be specified.” upper, lower, alpha, cntrl already checked above.
|
||
- if (is_punct(code_point) and is_digit(code_point)):
|
||
- sys.stderr.write('%(sym)s is punct and digit\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_punct(code_point) and is_xdigit(code_point)):
|
||
- sys.stderr.write('%(sym)s is punct and xdigit\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (is_punct(code_point) and code_point == 0x0020):
|
||
- sys.stderr.write('%(sym)s is punct\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- # graph restriction: “No character specified for the keyword cntrl
|
||
- # shall be specified.” Already checked above.
|
||
-
|
||
- # print restriction: “No character specified for the keyword cntrl
|
||
- # shall be specified.” Already checked above.
|
||
-
|
||
- # graph - print relation: differ only in the <space> character.
|
||
- # How is this possible if there are more than one space character?!
|
||
- # I think susv2/xbd/locale.html should speak of “space characters”,
|
||
- # not “space character”.
|
||
- if (is_print(code_point)
|
||
- and not (is_graph(code_point) or is_space(code_point))):
|
||
- sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
- if (not is_print(code_point)
|
||
- and (is_graph(code_point) or code_point == 0x0020)):
|
||
- sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
|
||
- 'sym': ucs_symbol(code_point)})
|
||
-
|
||
def read_input_file(filename):
|
||
'''Reads the original glibc i18n file to get the original head
|
||
and tail.
|
||
@@ -648,18 +208,18 @@ def output_tables(i18n_file, unicode_version):
|
||
+ 'program.\n\n')
|
||
i18n_file.write('% The "upper" class reflects the uppercase '
|
||
+ 'characters of class "alpha"\n')
|
||
- output_charclass(i18n_file, 'upper', is_upper)
|
||
+ output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
|
||
i18n_file.write('% The "lower" class reflects the lowercase '
|
||
+ 'characters of class "alpha"\n')
|
||
- output_charclass(i18n_file, 'lower', is_lower)
|
||
+ output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
|
||
i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
|
||
+ 'reflecting\n')
|
||
i18n_file.write('% the recommendations in TR 10176 annex A\n')
|
||
- output_charclass(i18n_file, 'alpha', is_alpha)
|
||
+ output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
|
||
i18n_file.write('% The "digit" class must only contain the '
|
||
+ 'BASIC LATIN digits, says ISO C 99\n')
|
||
i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
|
||
- output_charclass(i18n_file, 'digit', is_digit)
|
||
+ output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
|
||
i18n_file.write('% The "outdigit" information is by default '
|
||
+ '"0" to "9". We don\'t have to\n')
|
||
i18n_file.write('% provide it here since localedef will fill '
|
||
@@ -669,29 +229,30 @@ def output_tables(i18n_file, unicode_version):
|
||
i18n_file.write('% outdigit /\n')
|
||
i18n_file.write('% <U0030>..<U0039>\n\n')
|
||
# output_charclass(i18n_file, 'outdigit', is_outdigit)
|
||
- output_charclass(i18n_file, 'space', is_space)
|
||
- output_charclass(i18n_file, 'cntrl', is_cntrl)
|
||
- output_charclass(i18n_file, 'punct', is_punct)
|
||
- output_charclass(i18n_file, 'graph', is_graph)
|
||
- output_charclass(i18n_file, 'print', is_print)
|
||
+ output_charclass(i18n_file, 'space', unicode_utils.is_space)
|
||
+ output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
|
||
+ output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
|
||
+ output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
|
||
+ output_charclass(i18n_file, 'print', unicode_utils.is_print)
|
||
i18n_file.write('% The "xdigit" class must only contain the '
|
||
+ 'BASIC LATIN digits and A-F, a-f,\n')
|
||
i18n_file.write('% says ISO C 99 '
|
||
+ '(sections 7.25.2.1.12 and 6.4.4.1).\n')
|
||
- output_charclass(i18n_file, 'xdigit', is_xdigit)
|
||
- output_charclass(i18n_file, 'blank', is_blank)
|
||
- output_charmap(i18n_file, 'toupper', to_upper)
|
||
- output_charmap(i18n_file, 'tolower', to_lower)
|
||
- output_charmap(i18n_file, 'map "totitle";', to_title)
|
||
+ output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
|
||
+ output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
|
||
+ output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
|
||
+ output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
|
||
+ output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
|
||
i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
|
||
+ 'annex B.1\n')
|
||
i18n_file.write('% That is, all combining characters (level 2+3).\n')
|
||
- output_charclass(i18n_file, 'class "combining";', is_combining)
|
||
+ output_charclass(i18n_file, 'class "combining";',
|
||
+ unicode_utils.is_combining)
|
||
i18n_file.write('% The "combining_level3" class reflects '
|
||
+ 'ISO/IEC 10646-1 annex B.2\n')
|
||
i18n_file.write('% That is, combining characters of level 3.\n')
|
||
- output_charclass(i18n_file,
|
||
- 'class "combining_level3";', is_combining_level3)
|
||
+ output_charclass(i18n_file, 'class "combining_level3";',
|
||
+ unicode_utils.is_combining_level3)
|
||
|
||
if __name__ == "__main__":
|
||
PARSER = argparse.ArgumentParser(
|
||
@@ -739,9 +300,11 @@ if __name__ == "__main__":
|
||
help='The Unicode version of the input files used.')
|
||
ARGS = PARSER.parse_args()
|
||
|
||
- fill_attributes(ARGS.unicode_data_file)
|
||
- fill_derived_core_properties(ARGS.derived_core_properties_file)
|
||
- verifications()
|
||
+ unicode_utils.fill_attributes(
|
||
+ ARGS.unicode_data_file)
|
||
+ unicode_utils.fill_derived_core_properties(
|
||
+ ARGS.derived_core_properties_file)
|
||
+ unicode_utils.verifications()
|
||
HEAD = TAIL = ''
|
||
if ARGS.input_file:
|
||
(HEAD, TAIL) = read_input_file(ARGS.input_file)
|
||
diff --git a/localedata/unicode-gen/unicode_utils.py b/localedata/unicode-gen/unicode_utils.py
|
||
new file mode 100644
|
||
index 0000000..ee91582
|
||
--- /dev/null
|
||
+++ b/localedata/unicode-gen/unicode_utils.py
|
||
@@ -0,0 +1,502 @@
|
||
+# Utilities to generate Unicode data for glibc from upstream Unicode data.
|
||
+#
|
||
+# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||
+# This file is part of the GNU C Library.
|
||
+#
|
||
+# The GNU C Library is free software; you can redistribute it and/or
|
||
+# modify it under the terms of the GNU Lesser General Public
|
||
+# License as published by the Free Software Foundation; either
|
||
+# version 2.1 of the License, or (at your option) any later version.
|
||
+#
|
||
+# The GNU C Library is distributed in the hope that it will be useful,
|
||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
+# Lesser General Public License for more details.
|
||
+#
|
||
+# You should have received a copy of the GNU Lesser General Public
|
||
+# License along with the GNU C Library; if not, see
|
||
+# <http://www.gnu.org/licenses/>.
|
||
+
|
||
+'''
|
||
+This module contains utilities used by the scripts to generate
|
||
+Unicode data for glibc from upstream Unicode data files.
|
||
+'''
|
||
+
|
||
+import sys
|
||
+import re
|
||
+
|
||
+# Dictionary holding the entire contents of the UnicodeData.txt file
|
||
+#
|
||
+# Contents of this dictionary look like this:
|
||
+#
|
||
+# {0: {'category': 'Cc',
|
||
+# 'title': None,
|
||
+# 'digit': '',
|
||
+# 'name': '<control>',
|
||
+# 'bidi': 'BN',
|
||
+# 'combining': '0',
|
||
+# 'comment': '',
|
||
+# 'oldname': 'NULL',
|
||
+# 'decomposition': '',
|
||
+# 'upper': None,
|
||
+# 'mirrored': 'N',
|
||
+# 'lower': None,
|
||
+# 'decdigit': '',
|
||
+# 'numeric': ''},
|
||
+# …
|
||
+# }
|
||
+UNICODE_ATTRIBUTES = {}
|
||
+
|
||
+# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
|
||
+#
|
||
+# Contents of this dictionary look like this:
|
||
+#
|
||
+# {917504: ['Default_Ignorable_Code_Point'],
|
||
+# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
|
||
+# …
|
||
+# }
|
||
+DERIVED_CORE_PROPERTIES = {}
|
||
+
|
||
+# Dictionary holding the entire contents of the EastAsianWidths.txt file
|
||
+#
|
||
+# Contents of this dictionary look like this:
|
||
+#
|
||
+# {0: 'N', … , 45430: 'W', …}
|
||
+EAST_ASIAN_WIDTHS = {}
|
||
+
|
||
+def fill_attribute(code_point, fields):
|
||
+ '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
|
||
+
|
||
+ One entry in the UNICODE_ATTRIBUTES dictionary represents one line
|
||
+ in the UnicodeData.txt file.
|
||
+
|
||
+ '''
|
||
+ UNICODE_ATTRIBUTES[code_point] = {
|
||
+ 'name': fields[1], # Character name
|
||
+ 'category': fields[2], # General category
|
||
+ 'combining': fields[3], # Canonical combining classes
|
||
+ 'bidi': fields[4], # Bidirectional category
|
||
+ 'decomposition': fields[5], # Character decomposition mapping
|
||
+ 'decdigit': fields[6], # Decimal digit value
|
||
+ 'digit': fields[7], # Digit value
|
||
+ 'numeric': fields[8], # Numeric value
|
||
+ 'mirrored': fields[9], # mirrored
|
||
+ 'oldname': fields[10], # Old Unicode 1.0 name
|
||
+ 'comment': fields[11], # comment
|
||
+ # Uppercase mapping
|
||
+ 'upper': int(fields[12], 16) if fields[12] else None,
|
||
+ # Lowercase mapping
|
||
+ 'lower': int(fields[13], 16) if fields[13] else None,
|
||
+ # Titlecase mapping
|
||
+ 'title': int(fields[14], 16) if fields[14] else None,
|
||
+ }
|
||
+
|
||
+def fill_attributes(filename):
|
||
+ '''Stores the entire contents of the UnicodeData.txt file
|
||
+ in the UNICODE_ATTRIBUTES dictionary.
|
||
+
|
||
+ A typical line for a single code point in UnicodeData.txt looks
|
||
+ like this:
|
||
+
|
||
+ 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
|
||
+
|
||
+ Code point ranges are indicated by pairs of lines like this:
|
||
+
|
||
+ 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
|
||
+ 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
|
||
+ '''
|
||
+ with open(filename, mode='r') as unicode_data_file:
|
||
+ fields_start = []
|
||
+ for line in unicode_data_file:
|
||
+ fields = line.strip().split(';')
|
||
+ if len(fields) != 15:
|
||
+ sys.stderr.write(
|
||
+ 'short line in file "%(f)s": %(l)s\n' %{
|
||
+ 'f': filename, 'l': line})
|
||
+ exit(1)
|
||
+ if fields[2] == 'Cs':
|
||
+ # Surrogates are UTF-16 artefacts,
|
||
+ # not real characters. Ignore them.
|
||
+ fields_start = []
|
||
+ continue
|
||
+ if fields[1].endswith(', First>'):
|
||
+ fields_start = fields
|
||
+ fields_start[1] = fields_start[1].split(',')[0][1:]
|
||
+ continue
|
||
+ if fields[1].endswith(', Last>'):
|
||
+ fields[1] = fields[1].split(',')[0][1:]
|
||
+ if fields[1:] != fields_start[1:]:
|
||
+ sys.stderr.write(
|
||
+ 'broken code point range in file "%(f)s": %(l)s\n' %{
|
||
+ 'f': filename, 'l': line})
|
||
+ exit(1)
|
||
+ for code_point in range(
|
||
+ int(fields_start[0], 16),
|
||
+ int(fields[0], 16)+1):
|
||
+ fill_attribute(code_point, fields)
|
||
+ fields_start = []
|
||
+ continue
|
||
+ fill_attribute(int(fields[0], 16), fields)
|
||
+ fields_start = []
|
||
+
|
||
+def fill_derived_core_properties(filename):
|
||
+ '''Stores the entire contents of the DerivedCoreProperties.txt file
|
||
+ in the DERIVED_CORE_PROPERTIES dictionary.
|
||
+
|
||
+ Lines in DerivedCoreProperties.txt are either a code point range like
|
||
+ this:
|
||
+
|
||
+ 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
|
||
+
|
||
+ or a single code point like this:
|
||
+
|
||
+ 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
|
||
+
|
||
+ '''
|
||
+ with open(filename, mode='r') as derived_core_properties_file:
|
||
+ for line in derived_core_properties_file:
|
||
+ match = re.match(
|
||
+ r'^(?P<codepoint1>[0-9A-F]{4,6})'
|
||
+ + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
|
||
+ + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
|
||
+ line)
|
||
+ if not match:
|
||
+ continue
|
||
+ start = match.group('codepoint1')
|
||
+ end = match.group('codepoint2')
|
||
+ if not end:
|
||
+ end = start
|
||
+ for code_point in range(int(start, 16), int(end, 16)+1):
|
||
+ prop = match.group('property')
|
||
+ if code_point in DERIVED_CORE_PROPERTIES:
|
||
+ DERIVED_CORE_PROPERTIES[code_point].append(prop)
|
||
+ else:
|
||
+ DERIVED_CORE_PROPERTIES[code_point] = [prop]
|
||
+
|
||
+def fill_east_asian_widths(filename):
|
||
+ '''Stores the entire contents of the EastAsianWidths.txt file
|
||
+ in the EAST_ASIAN_WIDTHS dictionary.
|
||
+
|
||
+ Lines in EastAsianWidths.txt are either a code point range like
|
||
+ this:
|
||
+
|
||
+ 9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>
|
||
+
|
||
+ or a single code point like this:
|
||
+
|
||
+ A015;W # Lm YI SYLLABLE WU
|
||
+ '''
|
||
+ with open(filename, mode='r') as east_asian_widths_file:
|
||
+ for line in east_asian_widths_file:
|
||
+ match = re.match(
|
||
+ r'^(?P<codepoint1>[0-9A-F]{4,6})'
|
||
+ +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
|
||
+ +r'\s*;\s*(?P<property>[a-zA-Z]+)',
|
||
+ line)
|
||
+ if not match:
|
||
+ continue
|
||
+ start = match.group('codepoint1')
|
||
+ end = match.group('codepoint2')
|
||
+ if not end:
|
||
+ end = start
|
||
+ for code_point in range(int(start, 16), int(end, 16)+1):
|
||
+ EAST_ASIAN_WIDTHS[code_point] = match.group('property')
|
||
+
|
||
+def to_upper(code_point):
|
||
+ '''Returns the code point of the uppercase version
|
||
+ of the given code point'''
|
||
+ if (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and UNICODE_ATTRIBUTES[code_point]['upper']):
|
||
+ return UNICODE_ATTRIBUTES[code_point]['upper']
|
||
+ else:
|
||
+ return code_point
|
||
+
|
||
+def to_lower(code_point):
|
||
+ '''Returns the code point of the lowercase version
|
||
+ of the given code point'''
|
||
+ if (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and UNICODE_ATTRIBUTES[code_point]['lower']):
|
||
+ return UNICODE_ATTRIBUTES[code_point]['lower']
|
||
+ else:
|
||
+ return code_point
|
||
+
|
||
+def to_title(code_point):
|
||
+ '''Returns the code point of the titlecase version
|
||
+ of the given code point'''
|
||
+ if (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and UNICODE_ATTRIBUTES[code_point]['title']):
|
||
+ return UNICODE_ATTRIBUTES[code_point]['title']
|
||
+ else:
|
||
+ return code_point
|
||
+
|
||
+def is_upper(code_point):
|
||
+ '''Checks whether the character with this code point is uppercase'''
|
||
+ return (to_lower(code_point) != code_point
|
||
+ or (code_point in DERIVED_CORE_PROPERTIES
|
||
+ and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
|
||
+
|
||
+def is_lower(code_point):
|
||
+ '''Checks whether the character with this code point is lowercase'''
|
||
+ # Some characters are defined as “Lowercase” in
|
||
+ # DerivedCoreProperties.txt but do not have a mapping to upper
|
||
+ # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
|
||
+ # one of these.
|
||
+ return (to_upper(code_point) != code_point
|
||
+ # <U00DF> is lowercase, but without simple to_upper mapping.
|
||
+ or code_point == 0x00DF
|
||
+ or (code_point in DERIVED_CORE_PROPERTIES
|
||
+ and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
|
||
+
|
||
+def is_alpha(code_point):
|
||
+ '''Checks whether the character with this code point is alphabetic'''
|
||
+ return ((code_point in DERIVED_CORE_PROPERTIES
|
||
+ and
|
||
+ 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
|
||
+ or
|
||
+ # Consider all the non-ASCII digits as alphabetic.
|
||
+ # ISO C 99 forbids us to have them in category “digit”,
|
||
+ # but we want iswalnum to return true on them.
|
||
+ (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
|
||
+ and not (code_point >= 0x0030 and code_point <= 0x0039)))
|
||
+
|
||
+def is_digit(code_point):
|
||
+ '''Checks whether the character with this code point is a digit'''
|
||
+ if False:
|
||
+ return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
|
||
+ # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
|
||
+ # a zero. Must add <0> in front of them by hand.
|
||
+ else:
|
||
+ # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
|
||
+ # takes it away:
|
||
+ # 7.25.2.1.5:
|
||
+ # The iswdigit function tests for any wide character that
|
||
+ # corresponds to a decimal-digit character (as defined in 5.2.1).
|
||
+ # 5.2.1:
|
||
+ # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
|
||
+ return (code_point >= 0x0030 and code_point <= 0x0039)
|
||
+
|
||
+def is_outdigit(code_point):
|
||
+ '''Checks whether the character with this code point is outdigit'''
|
||
+ return (code_point >= 0x0030 and code_point <= 0x0039)
|
||
+
|
||
+def is_blank(code_point):
|
||
+ '''Checks whether the character with this code point is blank'''
|
||
+ return (code_point == 0x0009 # '\t'
|
||
+ # Category Zs without mention of '<noBreak>'
|
||
+ or (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
|
||
+ and '<noBreak>' not in
|
||
+ UNICODE_ATTRIBUTES[code_point]['decomposition']))
|
||
+
|
||
+def is_space(code_point):
|
||
+ '''Checks whether the character with this code point is a space'''
|
||
+ # Don’t make U+00A0 a space. Non-breaking space means that all programs
|
||
+ # should treat it like a punctuation character, not like a space.
|
||
+ return (code_point == 0x0020 # ' '
|
||
+ or code_point == 0x000C # '\f'
|
||
+ or code_point == 0x000A # '\n'
|
||
+ or code_point == 0x000D # '\r'
|
||
+ or code_point == 0x0009 # '\t'
|
||
+ or code_point == 0x000B # '\v'
|
||
+ # Categories Zl, Zp, and Zs without mention of "<noBreak>"
|
||
+ or (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and
|
||
+ (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
|
||
+ or
|
||
+ (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
|
||
+ and
|
||
+ '<noBreak>' not in
|
||
+ UNICODE_ATTRIBUTES[code_point]['decomposition']))))
|
||
+
|
||
+def is_cntrl(code_point):
|
||
+ '''Checks whether the character with this code point is
|
||
+ a control character'''
|
||
+ return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
|
||
+ or
|
||
+ UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
|
||
+
|
||
+def is_xdigit(code_point):
|
||
+ '''Checks whether the character with this code point is
|
||
+ a hexadecimal digit'''
|
||
+ if False:
|
||
+ return (is_digit(code_point)
|
||
+ or (code_point >= 0x0041 and code_point <= 0x0046)
|
||
+ or (code_point >= 0x0061 and code_point <= 0x0066))
|
||
+ else:
|
||
+ # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
|
||
+ # takes it away:
|
||
+ # 7.25.2.1.12:
|
||
+ # The iswxdigit function tests for any wide character that
|
||
+ # corresponds to a hexadecimal-digit character (as defined
|
||
+ # in 6.4.4.1).
|
||
+ # 6.4.4.1:
|
||
+ # hexadecimal-digit: one of
|
||
+ # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
|
||
+ return ((code_point >= 0x0030 and code_point <= 0x0039)
|
||
+ or (code_point >= 0x0041 and code_point <= 0x0046)
|
||
+ or (code_point >= 0x0061 and code_point <= 0x0066))
|
||
+
|
||
+def is_graph(code_point):
|
||
+ '''Checks whether the character with this code point is
|
||
+ a graphical character'''
|
||
+ return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
|
||
+ and not is_space(code_point))
|
||
+
|
||
+def is_print(code_point):
|
||
+ '''Checks whether the character with this code point is printable'''
|
||
+ return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
|
||
+ and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
|
||
+
|
||
+def is_punct(code_point):
|
||
+ '''Checks whether the character with this code point is punctuation'''
|
||
+ if False:
|
||
+ return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
|
||
+ else:
|
||
+ # The traditional POSIX definition of punctuation is every graphic,
|
||
+ # non-alphanumeric character.
|
||
+ return (is_graph(code_point)
|
||
+ and not is_alpha(code_point)
|
||
+ and not is_digit(code_point))
|
||
+
|
||
+def is_combining(code_point):
|
||
+ '''Checks whether the character with this code point is
|
||
+ a combining character'''
|
||
+ # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
|
||
+ # file. In 3.0.1 it was identical to the union of the general categories
|
||
+ # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
|
||
+ # PropList.txt file, so we take the latter definition.
|
||
+ return (UNICODE_ATTRIBUTES[code_point]['name']
|
||
+ and
|
||
+ UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
|
||
+
|
||
+def is_combining_level3(code_point):
|
||
+ '''Checks whether the character with this code point is
|
||
+ a combining level3 character'''
|
||
+ return (is_combining(code_point)
|
||
+ and
|
||
+ int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
|
||
+
|
||
+def ucs_symbol(code_point):
|
||
+ '''Return the UCS symbol string for a Unicode character.'''
|
||
+ if code_point < 0x10000:
|
||
+ return '<U{:04X}>'.format(code_point)
|
||
+ else:
|
||
+ return '<U{:08X}>'.format(code_point)
|
||
+
|
||
+def ucs_symbol_range(code_point_low, code_point_high):
|
||
+ '''Returns a string UCS symbol string for a code point range.
|
||
+
|
||
+ Example:
|
||
+
|
||
+ <U0041>..<U005A>
|
||
+ '''
|
||
+ return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
|
||
+
|
||
+def verifications():
|
||
+ '''Tests whether the is_* functions observe the known restrictions'''
|
||
+ for code_point in sorted(UNICODE_ATTRIBUTES):
|
||
+ # toupper restriction: "Only characters specified for the keywords
|
||
+ # lower and upper shall be specified.
|
||
+ if (to_upper(code_point) != code_point
|
||
+ and not (is_lower(code_point) or is_upper(code_point))):
|
||
+ sys.stderr.write(
|
||
+ ('%(sym)s is not upper|lower '
|
||
+ + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
|
||
+ 'sym': ucs_symbol(code_point),
|
||
+ 'c': code_point,
|
||
+ 'uc': to_upper(code_point)})
|
||
+ # tolower restriction: "Only characters specified for the keywords
|
||
+ # lower and upper shall be specified.
|
||
+ if (to_lower(code_point) != code_point
|
||
+ and not (is_lower(code_point) or is_upper(code_point))):
|
||
+ sys.stderr.write(
|
||
+ ('%(sym)s is not upper|lower '
|
||
+ + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
|
||
+ 'sym': ucs_symbol(code_point),
|
||
+ 'c': code_point,
|
||
+ 'uc': to_lower(code_point)})
|
||
+ # alpha restriction: "Characters classified as either upper or lower
|
||
+ # shall automatically belong to this class.
|
||
+ if ((is_lower(code_point) or is_upper(code_point))
|
||
+ and not is_alpha(code_point)):
|
||
+ sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ # alpha restriction: “No character specified for the keywords cntrl,
|
||
+ # digit, punct or space shall be specified.”
|
||
+ if (is_alpha(code_point) and is_cntrl(code_point)):
|
||
+ sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_alpha(code_point) and is_digit(code_point)):
|
||
+ sys.stderr.write('%(sym)s is alpha and digit\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_alpha(code_point) and is_punct(code_point)):
|
||
+ sys.stderr.write('%(sym)s is alpha and punct\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_alpha(code_point) and is_space(code_point)):
|
||
+ sys.stderr.write('%(sym)s is alpha and space\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ # space restriction: “No character specified for the keywords upper,
|
||
+ # lower, alpha, digit, graph or xdigit shall be specified.”
|
||
+ # upper, lower, alpha already checked above.
|
||
+ if (is_space(code_point) and is_digit(code_point)):
|
||
+ sys.stderr.write('%(sym)s is space and digit\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_space(code_point) and is_graph(code_point)):
|
||
+ sys.stderr.write('%(sym)s is space and graph\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_space(code_point) and is_xdigit(code_point)):
|
||
+ sys.stderr.write('%(sym)s is space and xdigit\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ # cntrl restriction: “No character specified for the keywords upper,
|
||
+ # lower, alpha, digit, punct, graph, print or xdigit shall be
|
||
+ # specified.” upper, lower, alpha already checked above.
|
||
+ if (is_cntrl(code_point) and is_digit(code_point)):
|
||
+ sys.stderr.write('%(sym)s is cntrl and digit\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_cntrl(code_point) and is_punct(code_point)):
|
||
+ sys.stderr.write('%(sym)s is cntrl and punct\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_cntrl(code_point) and is_graph(code_point)):
|
||
+ sys.stderr.write('%(sym)s is cntrl and graph\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_cntrl(code_point) and is_print(code_point)):
|
||
+ sys.stderr.write('%(sym)s is cntrl and print\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_cntrl(code_point) and is_xdigit(code_point)):
|
||
+ sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ # punct restriction: “No character specified for the keywords upper,
|
||
+ # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
|
||
+ # be specified.” upper, lower, alpha, cntrl already checked above.
|
||
+ if (is_punct(code_point) and is_digit(code_point)):
|
||
+ sys.stderr.write('%(sym)s is punct and digit\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_punct(code_point) and is_xdigit(code_point)):
|
||
+ sys.stderr.write('%(sym)s is punct and xdigit\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ if (is_punct(code_point) and code_point == 0x0020):
|
||
+ sys.stderr.write('%(sym)s is punct\n' %{
|
||
+ 'sym': ucs_symbol(code_point)})
|
||
+ # graph restriction: “No character specified for the keyword cntrl
|
||
+ # shall be specified.” Already checked above.
|
||
+
|
||
+ # print restriction: “No character specified for the keyword cntrl
|
||
+ # shall be specified.” Already checked above.
|
||
+
|
||
+        # graph - print relation: differ only in the <space> character.
|
||
+        # How is this possible if there are more than one space character?!
|
||
+        # I think susv2/xbd/locale.html should speak of “space characters”,
|
||
+        # not “space character”.
|
||
+        if (is_print(code_point)
|
||
+                and not (is_graph(code_point) or is_space(code_point))):
|
||
+            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
|
||
+                'sym': ucs_symbol(code_point)})
|
||
+        if (not is_print(code_point)
|
||
+                and (is_graph(code_point) or code_point == 0x0020)):
|
||
+            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
|
||
+                'sym': ucs_symbol(code_point)})
|
||
diff --git a/localedata/unicode-gen/utf8_compatibility.py b/localedata/unicode-gen/utf8_compatibility.py
|
||
index b84a1eb..3b7a94c 100755
|
||
--- a/localedata/unicode-gen/utf8_compatibility.py
|
||
+++ b/localedata/unicode-gen/utf8_compatibility.py
|
||
@@ -30,146 +30,7 @@ To see how this script is used, call it with the “-h” option:
|
||
import sys
|
||
import re
|
||
import argparse
|
||
-
|
||
-# Dictionary holding the entire contents of the UnicodeData.txt file
|
||
-#
|
||
-# Contents of this dictionary look like this:
|
||
-#
|
||
-# {0: {'category': 'Cc',
|
||
-# 'title': None,
|
||
-# 'digit': '',
|
||
-# 'name': '<control>',
|
||
-# 'bidi': 'BN',
|
||
-# 'combining': '0',
|
||
-# 'comment': '',
|
||
-# 'oldname': 'NULL',
|
||
-# 'decomposition': '',
|
||
-# 'upper': None,
|
||
-# 'mirrored': 'N',
|
||
-# 'lower': None,
|
||
-# 'decdigit': '',
|
||
-# 'numeric': ''},
|
||
-# …
|
||
-# }
|
||
-UNICODE_ATTRIBUTES = {}
|
||
-
|
||
-# Dictionary holding the entire contents of the EastAsianWidths.txt file
|
||
-#
|
||
-# Contents of this dictionary look like this:
|
||
-#
|
||
-# {0: 'N', … , 45430: 'W', …}
|
||
-EAST_ASIAN_WIDTHS = {}
|
||
-
|
||
-def fill_attribute(code_point, fields):
|
||
- '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
|
||
-
|
||
- One entry in the UNICODE_ATTRIBUTES dictionary represents one line
|
||
- in the UnicodeData.txt file.
|
||
-
|
||
- '''
|
||
- UNICODE_ATTRIBUTES[code_point] = {
|
||
- 'name': fields[1], # Character name
|
||
- 'category': fields[2], # General category
|
||
- 'combining': fields[3], # Canonical combining classes
|
||
- 'bidi': fields[4], # Bidirectional category
|
||
- 'decomposition': fields[5], # Character decomposition mapping
|
||
- 'decdigit': fields[6], # Decimal digit value
|
||
- 'digit': fields[7], # Digit value
|
||
- 'numeric': fields[8], # Numeric value
|
||
- 'mirrored': fields[9], # mirrored
|
||
- 'oldname': fields[10], # Old Unicode 1.0 name
|
||
- 'comment': fields[11], # comment
|
||
- # Uppercase mapping
|
||
- 'upper': int(fields[12], 16) if fields[12] else None,
|
||
- # Lowercase mapping
|
||
- 'lower': int(fields[13], 16) if fields[13] else None,
|
||
- # Titlecase mapping
|
||
- 'title': int(fields[14], 16) if fields[14] else None,
|
||
- }
|
||
-
|
||
-def fill_attributes(filename):
|
||
- '''Stores the entire contents of the UnicodeData.txt file
|
||
- in the UNICODE_ATTRIBUTES dictionary.
|
||
-
|
||
- A typical line for a single code point in UnicodeData.txt looks
|
||
- like this:
|
||
-
|
||
- 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
|
||
-
|
||
- Code point ranges are indicated by pairs of lines like this:
|
||
-
|
||
- 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
|
||
- 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
|
||
- '''
|
||
- with open(filename, mode='r') as unicode_data_file:
|
||
- fields_start = []
|
||
- for line in unicode_data_file:
|
||
- fields = line.strip().split(';')
|
||
- if len(fields) != 15:
|
||
- sys.stderr.write(
|
||
- 'short line in file "%(f)s": %(l)s\n' %{
|
||
- 'f': filename, 'l': line})
|
||
- exit(1)
|
||
- if fields[2] == 'Cs':
|
||
- # Surrogates are UTF-16 artefacts,
|
||
- # not real characters. Ignore them.
|
||
- fields_start = []
|
||
- continue
|
||
- if fields[1].endswith(', First>'):
|
||
- fields_start = fields
|
||
- fields_start[1] = fields_start[1].split(',')[0][1:]
|
||
- continue
|
||
- if fields[1].endswith(', Last>'):
|
||
- fields[1] = fields[1].split(',')[0][1:]
|
||
- if fields[1:] != fields_start[1:]:
|
||
- sys.stderr.write(
|
||
- 'broken code point range in file "%(f)s": %(l)s\n' %{
|
||
- 'f': filename, 'l': line})
|
||
- exit(1)
|
||
- for code_point in range(
|
||
- int(fields_start[0], 16),
|
||
- int(fields[0], 16)+1):
|
||
- fill_attribute(code_point, fields)
|
||
- fields_start = []
|
||
- continue
|
||
- fill_attribute(int(fields[0], 16), fields)
|
||
- fields_start = []
|
||
-
|
||
-def fill_east_asian_widths(filename):
|
||
- '''Stores the entire contents of the EastAsianWidths.txt file
|
||
- in the EAST_ASIAN_WIDTHS dictionary.
|
||
-
|
||
- Lines in EastAsianWidths.txt are either a code point range like
|
||
- this:
|
||
-
|
||
- 9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>
|
||
-
|
||
- or a single code point like this:
|
||
-
|
||
- A015;W # Lm YI SYLLABLE WU
|
||
- '''
|
||
- with open(filename, mode='r') as east_asian_widths_file:
|
||
- for line in east_asian_widths_file:
|
||
- match = re.match(
|
||
- r'^(?P<codepoint1>[0-9A-F]{4,6})'
|
||
- +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
|
||
- +r'\s*;\s*(?P<property>[a-zA-Z]+)',
|
||
- line)
|
||
- if not match:
|
||
- continue
|
||
- start = match.group('codepoint1')
|
||
- end = match.group('codepoint2')
|
||
- if not end:
|
||
- end = start
|
||
- for code_point in range(int(start, 16), int(end, 16)+1):
|
||
- EAST_ASIAN_WIDTHS[code_point] = match.group('property')
|
||
-
|
||
-def ucs_symbol(code_point):
|
||
- '''Return the UCS symbol string for a Unicode character.'''
|
||
- if code_point < 0x10000:
|
||
- return '<U{:04X}>'.format(code_point)
|
||
- else:
|
||
- return '<U{:08X}>'.format(code_point)
|
||
+import unicode_utils
|
||
|
||
def create_charmap_dictionary(file_name):
|
||
'''Create a dictionary for all code points found in the CHARMAP
|
||
@@ -217,10 +78,10 @@ def check_charmap(original_file_name, new_file_name):
|
||
if ARGS.show_missing_characters:
|
||
for key in sorted(set(ocharmap)-set(ncharmap)):
|
||
print('removed: {:s} {:s} {:s}'.format(
|
||
- ucs_symbol(key),
|
||
+ unicode_utils.ucs_symbol(key),
|
||
ocharmap[key],
|
||
- UNICODE_ATTRIBUTES[key]['name'] \
|
||
- if key in UNICODE_ATTRIBUTES else None))
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
|
||
print('------------------------------------------------------------')
|
||
changed_charmap = {}
|
||
for key in set(ocharmap).intersection(set(ncharmap)):
|
||
@@ -231,21 +92,21 @@ def check_charmap(original_file_name, new_file_name):
|
||
if ARGS.show_changed_characters:
|
||
for key in sorted(changed_charmap):
|
||
print('changed: {:s} {:s}->{:s} {:s}'.format(
|
||
- ucs_symbol(key),
|
||
+ unicode_utils.ucs_symbol(key),
|
||
changed_charmap[key][0],
|
||
changed_charmap[key][1],
|
||
- UNICODE_ATTRIBUTES[key]['name'] \
|
||
- if key in UNICODE_ATTRIBUTES else None))
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
|
||
print('------------------------------------------------------------')
|
||
print('Total added characters in newly generated CHARMAP: %d'
|
||
%len(set(ncharmap)-set(ocharmap)))
|
||
if ARGS.show_added_characters:
|
||
for key in sorted(set(ncharmap)-set(ocharmap)):
|
||
print('added: {:s} {:s} {:s}'.format(
|
||
- ucs_symbol(key),
|
||
+ unicode_utils.ucs_symbol(key),
|
||
ncharmap[key],
|
||
- UNICODE_ATTRIBUTES[key]['name'] \
|
||
- if key in UNICODE_ATTRIBUTES else None))
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
|
||
|
||
def create_width_dictionary(file_name):
|
||
'''Create a dictionary for all code points found in the WIDTH
|
||
@@ -290,20 +151,20 @@ def check_width(original_file_name, new_file_name):
|
||
+ 'i.e. these have width 1 now.)')
|
||
if ARGS.show_missing_characters:
|
||
for key in sorted(set(owidth)-set(nwidth)):
|
||
- print('removed: {:s} '.format(ucs_symbol(key))
|
||
+ print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
|
||
+ '{:d} : '.format(owidth[key])
|
||
+ 'eaw={:s} '.format(
|
||
- EAST_ASIAN_WIDTHS[key]
|
||
- if key in EAST_ASIAN_WIDTHS else None)
|
||
+ unicode_utils.EAST_ASIAN_WIDTHS[key]
|
||
+ if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
|
||
+ 'category={:2s} '.format(
|
||
- UNICODE_ATTRIBUTES[key]['category']
|
||
- if key in UNICODE_ATTRIBUTES else None)
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['category']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
|
||
+ 'bidi={:3s} '.format(
|
||
- UNICODE_ATTRIBUTES[key]['bidi']
|
||
- if key in UNICODE_ATTRIBUTES else None)
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
|
||
+ 'name={:s}'.format(
|
||
- UNICODE_ATTRIBUTES[key]['name']
|
||
- if key in UNICODE_ATTRIBUTES else None))
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['name']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
|
||
print('------------------------------------------------------------')
|
||
changed_width = {}
|
||
for key in set(owidth).intersection(set(nwidth)):
|
||
@@ -313,21 +174,21 @@ def check_width(original_file_name, new_file_name):
|
||
%len(changed_width))
|
||
if ARGS.show_changed_characters:
|
||
for key in sorted(changed_width):
|
||
- print('changed width: {:s} '.format(ucs_symbol(key))
|
||
+ print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
|
||
+ '{:d}->{:d} : '.format(changed_width[key][0],
|
||
changed_width[key][1])
|
||
+ 'eaw={:s} '.format(
|
||
- EAST_ASIAN_WIDTHS[key]
|
||
- if key in EAST_ASIAN_WIDTHS else None)
|
||
+ unicode_utils.EAST_ASIAN_WIDTHS[key]
|
||
+ if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
|
||
+ 'category={:2s} '.format(
|
||
- UNICODE_ATTRIBUTES[key]['category']
|
||
- if key in UNICODE_ATTRIBUTES else None)
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['category']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
|
||
+ 'bidi={:3s} '.format(
|
||
- UNICODE_ATTRIBUTES[key]['bidi']
|
||
- if key in UNICODE_ATTRIBUTES else None)
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
|
||
+ 'name={:s}'.format(
|
||
- UNICODE_ATTRIBUTES[key]['name']
|
||
- if key in UNICODE_ATTRIBUTES else None))
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['name']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
|
||
print('------------------------------------------------------------')
|
||
print('Total added characters in newly generated WIDTH: %d'
|
||
%len(set(nwidth)-set(owidth)))
|
||
@@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name):
|
||
+ 'i.e. these had width 1 before.)')
|
||
if ARGS.show_added_characters:
|
||
for key in sorted(set(nwidth)-set(owidth)):
|
||
- print('added: {:s} '.format(ucs_symbol(key))
|
||
+ print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
|
||
+ '{:d} : '.format(nwidth[key])
|
||
+ 'eaw={:s} '.format(
|
||
- EAST_ASIAN_WIDTHS[key]
|
||
- if key in EAST_ASIAN_WIDTHS else None)
|
||
+ unicode_utils.EAST_ASIAN_WIDTHS[key]
|
||
+ if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
|
||
+ 'category={:2s} '.format(
|
||
- UNICODE_ATTRIBUTES[key]['category']
|
||
- if key in UNICODE_ATTRIBUTES else None)
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['category']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
|
||
+ 'bidi={:3s} '.format(
|
||
- UNICODE_ATTRIBUTES[key]['bidi']
|
||
- if key in UNICODE_ATTRIBUTES else None)
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
|
||
+ 'name={:s}'.format(
|
||
- UNICODE_ATTRIBUTES[key]['name']
|
||
- if key in UNICODE_ATTRIBUTES else None))
|
||
+ unicode_utils.UNICODE_ATTRIBUTES[key]['name']
|
||
+ if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
|
||
|
||
if __name__ == "__main__":
|
||
PARSER = argparse.ArgumentParser(
|
||
@@ -392,8 +253,8 @@ if __name__ == "__main__":
|
||
ARGS = PARSER.parse_args()
|
||
|
||
if ARGS.unicode_data_file:
|
||
- fill_attributes(ARGS.unicode_data_file)
|
||
+ unicode_utils.fill_attributes(ARGS.unicode_data_file)
|
||
if ARGS.east_asian_width_file:
|
||
- fill_east_asian_widths(ARGS.east_asian_width_file)
|
||
+ unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
|
||
check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
|
||
check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
|
||
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
|
||
index f1b88f5..bc84c07 100755
|
||
--- a/localedata/unicode-gen/utf8_gen.py
|
||
+++ b/localedata/unicode-gen/utf8_gen.py
|
||
@@ -29,6 +29,7 @@ It will output UTF-8 file
|
||
|
||
import sys
|
||
import re
|
||
+import unicode_utils
|
||
|
||
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
|
||
# sections 3.11 and 4.4.
|
||
@@ -49,13 +50,6 @@ JAMO_FINAL_SHORT_NAME = (
|
||
'P', 'H'
|
||
)
|
||
|
||
-def ucs_symbol(code_point):
|
||
- '''Return the UCS symbol string for a Unicode character.'''
|
||
- if code_point < 0x10000:
|
||
- return '<U{:04X}>'.format(code_point)
|
||
- else:
|
||
- return '<U{:08X}>'.format(code_point)
|
||
-
|
||
def process_range(start, end, outfile, name):
|
||
'''Writes a range of code points into the CHARMAP section of the
|
||
output file
|
||
@@ -78,7 +72,7 @@ def process_range(start, end, outfile, name):
|
||
+ JAMO_MEDIAL_SHORT_NAME[index2] \
|
||
+ JAMO_FINAL_SHORT_NAME[index3]
|
||
outfile.write('{:<11s} {:<12s} {:s}\n'.format(
|
||
- ucs_symbol(i), convert_to_hex(i),
|
||
+ unicode_utils.ucs_symbol(i), convert_to_hex(i),
|
||
hangul_syllable_name))
|
||
return
|
||
# UnicodeData.txt file has contains code point ranges like this:
|
||
@@ -95,14 +89,14 @@ def process_range(start, end, outfile, name):
|
||
for i in range(int(start, 16), int(end, 16), 64 ):
|
||
if i > (int(end, 16)-64):
|
||
outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
|
||
- ucs_symbol(i),
|
||
- ucs_symbol(int(end,16)),
|
||
+ unicode_utils.ucs_symbol(i),
|
||
+ unicode_utils.ucs_symbol(int(end,16)),
|
||
convert_to_hex(i),
|
||
name))
|
||
break
|
||
outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
|
||
- ucs_symbol(i),
|
||
- ucs_symbol(i+63),
|
||
+ unicode_utils.ucs_symbol(i),
|
||
+ unicode_utils.ucs_symbol(i+63),
|
||
convert_to_hex(i),
|
||
name))
|
||
|
||
@@ -168,7 +162,7 @@ def process_charmap(flines, outfile):
|
||
# comments, so we keep these comment lines.
|
||
outfile.write('%')
|
||
outfile.write('{:<11s} {:<12s} {:s}\n'.format(
|
||
- ucs_symbol(int(fields[0], 16)),
|
||
+ unicode_utils.ucs_symbol(int(fields[0], 16)),
|
||
convert_to_hex(int(fields[0], 16)),
|
||
fields[1]))
|
||
|
||
@@ -230,7 +224,7 @@ def process_width(outfile, ulines, elines):
|
||
for line in ulines:
|
||
fields = line.split(";")
|
||
if fields[4] == "NSM" or fields[2] == "Cf":
|
||
- width_dict[int(fields[0], 16)] = ucs_symbol(
|
||
+ width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
|
||
int(fields[0], 16)) + '\t0'
|
||
|
||
for line in elines:
|
||
@@ -238,7 +232,7 @@ def process_width(outfile, ulines, elines):
|
||
# UnicodeData.txt:
|
||
fields = line.split(";")
|
||
if not '..' in fields[0]:
|
||
- width_dict[int(fields[0], 16)] = ucs_symbol(
|
||
+ width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
|
||
int(fields[0], 16)) + '\t2'
|
||
else:
|
||
code_points = fields[0].split("..")
|
||
@@ -247,8 +241,8 @@ def process_width(outfile, ulines, elines):
|
||
if key in width_dict:
|
||
del width_dict[key]
|
||
width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
|
||
- ucs_symbol(int(code_points[0], 16)),
|
||
- ucs_symbol(int(code_points[1], 16)))
|
||
+ unicode_utils.ucs_symbol(int(code_points[0], 16)),
|
||
+ unicode_utils.ucs_symbol(int(code_points[1], 16)))
|
||
|
||
for key in sorted(width_dict):
|
||
outfile.write(width_dict[key]+'\n')
|
||
--
|
||
2.4.3
|
||
|