xref: /aosp_15_r20/external/cldr/common/transforms/Arabic-Latin.xml (revision 912701f9769bb47905792267661f0baf2b85bed5)
1<?xml version="1.0" encoding="UTF-8" ?>
2<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
3<!--
4Copyright © 1991-2013 Unicode, Inc.
5CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
6For terms of use, see http://www.unicode.org/copyright.html
7-->
8<supplementalData>
9	<version number="$Revision$"/>
10	<transforms>
11		<transform source="Arab" target="Latn" direction="both" alias="Arabic-Latin und-Latn-t-und-arab" backwardAlias="Latin-Arabic und-Arab-t-und-latn">
12			<tRule><![CDATA[
13# Generally follows UNGEGN
14#     http://www.eki.ee/wgrs/rom1_ar.pdf
15# Occasionally deviates in the direction of ISO 233
16#     http://homepage.mac.com/sirbinks/pdf/Arabic.pdf
17# a) where required for disambiguation.
18# b) with underdot instead of cedilla for letters like SAD,
19#    since those are explicitly in Unicode for transliteration.
20# c) with extra non-Arabic-language letters, like PEH
21#
22# Does *not* do assimilation of "al", nor hyphenation.
23# While it could be done, we need to determine whether a prefix "al" could
24# occur other than as the definite article (since no space is used).
25:: [[:Arabic:][:block=ARABIC:][‎ⁿ،؛؟ـً-ٕ٠-٬۰-۹﷼ښ]] ;
26:: NFKD (NFC);
27$disambig =  ̱ ; # COMBINING MACRON BELOW: appended to a Latin result so it stays distinct from another rule's result (e.g. ALEF MAKSURA vs. YEH)
28$disambig2 =  ̰ ; # COMBINING TILDE BELOW: second distinguishing mark, used for FARSI YEH
29$under =  ̣ ; # COMBINING DOT BELOW: the "underdot" used for SAD, DAD, TAH, ZAH, HAH and ALEF
30$descender = ˌ; # MODIFIER LETTER LOW VERTICAL LINE: used for SEEN WITH DOT BELOW AND DOT ABOVE
31$notAbove = [[:^ccc=0:] & [:^ccc=230:]]; # marks whose combining class is neither 0 nor 230; skipped over by the TEH MARBUTA reverse rule
32
33# non-letters
34[:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR
35[:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR
36٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR
37٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR
38#  ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate
39، ↔ ',' ; # ARABIC COMMA
40؛ ↔ ';' ; # ARABIC SEMICOLON
41؟ ↔ '?' ; # ARABIC QUESTION MARK
42٪ ↔ '%' ; # ARABIC PERCENT SIGN
43۰ ↔ 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
44۱ ↔ 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
45۲ ↔ 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
46۳ ↔ 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
47۴ ↔ 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
48۵ ↔ 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
49۶ ↔ 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
50۷ ↔ 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
51۸ ↔ 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
52۹ ↔ 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE
53٠ ↔ 0 ; # ARABIC-INDIC DIGIT ZERO
54١ ↔ 1 ; # ARABIC-INDIC DIGIT ONE
55٢ ↔ 2 ; # ARABIC-INDIC DIGIT TWO
56٣ ↔ 3 ; # ARABIC-INDIC DIGIT THREE
57٤ ↔ 4 ; # ARABIC-INDIC DIGIT FOUR
58٥ ↔ 5 ; # ARABIC-INDIC DIGIT FIVE
59٦ ↔ 6 ; # ARABIC-INDIC DIGIT SIX
60٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN
61٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT
62٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE
63
64؉ ↔ ‰ ;	# U+0609	ARABIC-INDIC PER MILLE SIGN
65؊ ↔ ‱ ;	 # U+060A	ARABIC-INDIC PER TEN THOUSAND SIGN
66‎۔‎ ↔ '.' ; 	# U+06D4	ARABIC FULL STOP
67
68# letters
69# long vowels
70َا↔ ā ; # ARABIC FATHA, ARABIC LETTER ALEF
71ُو ↔ ū ; # ARABIC DAMMA, ARABIC LETTER WAW
72ِي ↔ ī ; # ARABIC KASRA, ARABIC LETTER YEH
73# longer items moved here to prevent masking
74ث ↔ t h $disambig ; # ARABIC LETTER THEH
75ذ ↔ d h $disambig ; # ARABIC LETTER THAL
76ش ↔ s h $disambig ; # ARABIC LETTER SHEEN
77ص ↔ s $under ; # ARABIC LETTER SAD
78ض ↔ d $under ; # ARABIC LETTER DAD
79ط ↔ t $under ; # ARABIC LETTER TAH
80ظ ↔ z $under ; # ARABIC LETTER ZAH
81غ ↔ g h $disambig ; # ARABIC LETTER GHAIN
82
83# WARNING: special case
84# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→
85# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
86# ةٕ ← ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
87ة ↔ t ̈ ; # ARABIC LETTER TEH MARBUTA
88ة | $1 ← t ($notAbove+) ̈ ; # ARABIC LETTER TEH MARBUTA
89
90# non-Arabic language letters (each Latin side carries a distinguishing mark so the rule is reversible)
91ژ ↔ z h $disambig ; # ARABIC LETTER JEH
92ڭ ↔ n $disambig g ; # ARABIC LETTER NG
93ۋ ↔ v $disambig ; # ARABIC LETTER VE
94ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH
95ښ ↔ s $descender; # ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
96
97# Arabic language
98ء ↔ ʾ ; # ARABIC LETTER HAMZA
99ا ↔ a $under; # ARABIC LETTER ALEF
100ب ↔ b ; # ARABIC LETTER BEH
101ت ↔ t ; # ARABIC LETTER TEH
102ج ↔ j ; # ARABIC LETTER JEEM
103ح ↔ h $under ; # ARABIC LETTER HAH
104خ ↔ k h $disambig ; # ARABIC LETTER KHAH
105د ↔ d ; # ARABIC LETTER DAL
106ر ↔ r ; # ARABIC LETTER REH
107ز ↔ z ; # ARABIC LETTER ZAIN
108س ↔ s ; # ARABIC LETTER SEEN
109ع ↔ ʿ ; # ARABIC LETTER AIN
110ـ → ; # ARABIC TATWEEL
111ف ↔ f ; # ARABIC LETTER FEH
112ق ↔ q ; # ARABIC LETTER QAF
113ک ↔ k $disambig ; # ARABIC LETTER KEHEH
114ك ↔ k ; # ARABIC LETTER KAF
115ل ↔ l ; # ARABIC LETTER LAM
116م ↔ m ; # ARABIC LETTER MEEM
117ن ↔ n ; # ARABIC LETTER NOON
118ه ↔ h ; # ARABIC LETTER HEH
119و ↔ w ; # ARABIC LETTER WAW
120ى ↔ y $disambig ; # ARABIC LETTER ALEF MAKSURA
121ي ↔ y ; # ARABIC LETTER YEH
122ً ↔ aⁿ ; # ARABIC FATHATAN
123ٌ ↔ uⁿ ; # ARABIC DAMMATAN
124ٍ ↔ iⁿ ; # ARABIC KASRATAN
125َ ↔ a ; # ARABIC FATHA
126ُ ↔ u ; # ARABIC DAMMA
127ِ ↔ i ; # ARABIC KASRA
128ّ ↔   ̃ ; # ARABIC SHADDA
129ْ ↔   ̊ ; # ARABIC SUKUN
130
131# special combining marks
132ٓ ↔  ̂ ; # ARABIC MADDAH ABOVE
133ٔ ↔  ̉ ; # ARABIC HAMZA ABOVE
134ٕ ↔  ̹ ; # ARABIC HAMZA BELOW
135
136# Some non-Arabic language (not in UNGEGN)
137پ ↔ p ; # ARABIC LETTER PEH
138چ ↔ c h $disambig ; # ARABIC LETTER TCHEH
139ڤ ↔ v ; # ARABIC LETTER VEH
140# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
141# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
142گ ↔ g ; # ARABIC LETTER GAF
143
144# fallbacks (one-way, Arabic → Latin only). TODO: roundtrip where possible, using diacritics to distinguish
145#https://en.wikipedia.org/wiki/Sindhi_transliteration
146‎ٺ‎→ṭh;
147‎ٿ‎→th;
148‎ٽ‎→ṭ;
149‎ڙ‎→ṛ;
150‎ڦ‎→ph;
151‎ڻ‎→ṇ;
152‎ڱ‎→ṅ;
153‎ڃ‎→ñ;
154‎ڪ‎→k;
155‎ڄ‎→j̈;
156‎ۃ‎→ẖ;
157‎ڳ‎→g̤;
158‎ڍ‎→ḍh;
159‎ڌ‎→dh;
160‎ڏ‎→d̤;
161‎ڊ‎→ḍ;
162‎ڇ‎→ch;
163‎ڀ‎→bh;
164‎ٻ‎→ḇ;
165‎۽‎→'&';
166‎۾‎→'mn';
167
168#https://en.wiktionary.org/wiki/Wiktionary:Urdu_transliteration
169‎ھ‎ → ʱ ;
170‎ں‎ → ◌̃ ;
171‎ے‎ → ai ;
172‎ڈ‎ → ḍ ;
173‎ڑ‎ → ṛ ;
174‎ٹ‎ → ṭ ;
175
176#https://www.eki.ee/wgrs/rom2_ps.htm
177#https://en.wikipedia.org/wiki/Pashto_alphabet
178‎ټ‎ → ṯ ;
179‎ځ‎ → dz ;
180‎څ‎ → ts ;
181‎ډ‎ → ḏ ;
182‎ړ‎ → ṟ ;
183‎ږ‎ → z͟h ;
184‎ګ‎ → g ;
185‎ڼ‎ → ṉ ;
186‎ۍ‎ → ạy ;
187‎ې‎ → e ;
188
189#https://www.eki.ee/wgrs/rom1_ug.pdf
190‎ہ‎ → ḥ ;
191‎ە‎ → ĥ ;
192
193# fallbacks (reverse direction only): Latin letters with no rule above are rewritten to ones that have; the leading | puts the cursor before the replacement so it is matched again
194| s ← c } [eiy]; # c followed by e, i or y is re-read as s
195| k ← c ; # any other c is re-read as k
196| i ← e ; # e is re-read as i
197| u ← o ; # o is re-read as u
198| ks ← x ; # x is re-read as ks
199| n ← ‎ⁿ; # a superscript n not consumed by the tanween rules is re-read as n
200:: (lower) ; # parenthesized, so applied in the reverse direction only: lowercase the Latin input
201::NFC (NFD); # forward direction composes (NFC); reverse direction decomposes (NFD)
202:: ( [[:Latin:] [%,.0-9;?ʾ-ʿ̂-̄̈-̣̰̊-̱̹;ˌ]] );
203			]]></tRule>
204		</transform>
205	</transforms>
206</supplementalData>
207