xref: /aosp_15_r20/external/cldr/common/transforms/Greek-Latin.xml (revision 912701f9769bb47905792267661f0baf2b85bed5)
1<?xml version="1.0" encoding="UTF-8" ?>
2<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
3<!--
4Copyright © 1991-2013 Unicode, Inc.
5CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
6For terms of use, see http://www.unicode.org/copyright.html
7-->
8<supplementalData>
9	<version number="$Revision$"/>
10	<transforms>
11		<transform source="Grek" target="Latn" direction="both" alias="Greek-Latin und-Latn-t-und-grek" backwardAlias="Latin-Greek und-Grek-t-und-latn">
12			<tRule><![CDATA[
13# Rules are predicated on running NFD first, and NFC afterwards
14# :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:nonspacing mark:]] ;
15# MINIMAL FILTER GENERATED FOR: Greek-Latin
16:: [΄´;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ̄̈̓-̔͂-ͅͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ;
17:: NFD (NFC) ;
18# TEST CASES
19# Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος
20# ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ
21# ᾳ ῃ ῳ ὃ ὄ
22# ὠς ὡς ὢς ὣς
23# Ὠς Ὡς Ὢς Ὣς
24# ὨΣ ὩΣ ὪΣ ὫΣ
25# Ạ, ạ, Ẹ, ẹ, Ọ, ọ
26# Useful variables: character classes shared by the rules below
27$lower = [[:latin:][:greek:] & [:Ll:]]; # lowercase letters of either script
28$glower = [[:greek:] & [:Ll:]]; # lowercase Greek letters only
29$upper = [[:latin:][:greek:] & [:Lu:]] ; # uppercase letters of either script
30$accent = [:M:] ; # any mark
31# NOTE: restrict to just the Greek & Latin accents that we care about
32# TODO: broaden out once iteration is fixed
33$accentMinus = [ [̀-ͅ] & [:M:] - [̸]] ; # restricted accent set; not the same as $accent_minus, which is defined with the iota rules
34$macron = ̄ ;
35$ddot = ̈ ;
36$ddotmac = [$ddot$macron]; # either of the two marks above
37$lcgvowel = [αεηιουω] ;
38$ucgvowel = [ΑΕΗΙΟΥΩ] ;
39$gvowel = [$lcgvowel $ucgvowel] ; # Greek vowels of either case
40$lcgvowelC = [$lcgvowel $accent] ; # NOTE(review): not referenced by any rule in this file
41$evowel = [aeiouyAEIOUY]; # Latin-side vowels
42$evowel2 = [iuyIUY]; # Latin vowels accepted as the second letter of a vowel pair in the reverse breathing rules
43$vowel = [ $evowel $gvowel] ; # vowels of either script
44$gammaLike = [ΓΚΞΧγκξχϰ] ; # Greek letters before which gamma is written n
45$egammaLike = [GKXCgkxc] ; # Latin-side counterpart of $gammaLike, used as context by the reverse rules
46$smooth = ̓ ;
47$rough = ̔ ;
48$iotasub = ͅ ;
49$evowel_i = [$evowel-[iI]] ; # NOTE(review): not referenced by any rule in this file
50$evowel2_i = [uyUY]; # $evowel2 without i/I
51$underbar = ̱; # mark attached to s to flag an exceptional sigma form (see the sigma rules)
52$afterLetter = [:L:] [[:M:]\']* ; # a letter followed by any run of marks or apostrophes
53$beforeLetter = [[:M:]\']* [:L:] ; # any run of marks or apostrophes followed by a letter
54$beforeLower = $accent * $lower ; # optional marks followed by a lowercase letter
55$notLetter = [^[:L:][:M:]] ; # any character that is neither a letter nor a mark
56$under = ̱; # NOTE(review): appears to be the same mark as $underbar; used only by the punctuation rules
57# Fix punctuation
58# preserve original: a literal colon or question mark is tagged with $under so that it round-trips unchanged
59
60\: ↔ \: $under ;
61\? ↔ \? $under ;
62\; ↔ \? ; # semicolon (used as the Greek question mark) corresponds to a Latin question mark
63· ↔ \: ; # middle dot (Greek raised point) corresponds to a Latin colon
64΄ ↔ ´; # tonos sign corresponds to the spacing acute accent
65# CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve
66͂ ↔ ̂ ;
67# IOTA: convert iota subscript to iota
68# first make previous alpha long!
69$accent_minus = [[$accent]-[$iotasub$macron]]; # any mark except iota subscript and macron; a different set from $accentMinus
70Α } $accent_minus * $iotasub → | Α $macron ; # macron is excluded from the context, so the rule cannot fire again once the macron is inserted
71α } $accent_minus * $iotasub → | α $macron ; # cursor is left before the alpha so the following rules still process it
72# now convert to uppercase if after uppercase, otherwise to lowercase
73$upper $accent * { $iotasub → I ; # iota subscript after an uppercase letter (plus any marks) becomes capital I
74$iotasub → i ; # every other iota subscript becomes lowercase i
75| $1 $iotasub ← ($evowel $macron $accentMinus *) i ; # reverse: i after a Latin vowel with macron becomes iota subscript; cursor returns before the vowel
76| $1 $iotasub ← ($evowel $macron $accentMinus *) I ; # same for capital I
77# BREATHING
78# Convert rough breathing to h, and move before letters.
79# Titlecase, breathing on the capital vowel: A ` x → H a x (H is emitted, the vowel is lowercased and left for later rules)
80Α ($macron?) $rough } $beforeLower → H | α $1;
81Ε $rough } $beforeLower → H | ε;
82Η $rough } $beforeLower → H | η ;
83Ι ($ddot?) $rough } $beforeLower → H | ι  $1;
84Ο $rough } $beforeLower → H | ο ;
85Υ $rough } $beforeLower → H | υ ;
86Ω ($ddot?) $rough } $beforeLower → H | ω $1; # NOTE(review): captures $ddot? like the Ι rule rather than $macron? like the Α rule; confirm intended
87# Titlecase, breathing on the following lowercase Greek letter: A x ` → H a x
88Α ($glower $macron?) $rough → H | α $1 ;
89Ε ($glower) $rough → H | ε $1 ;
90Η ($glower) $rough → H | η $1 ;
91Ι ($glower $ddot?) $rough → H | ι $1 ;
92Ο ($glower) $rough → H | ο $1 ;
93Υ ($glower) $rough → H | υ $1 ;
94Ω ($glower  $ddot?) $rough → H | ω $1 ;
95# Otherwise, make x ` into h x and X ` into H X (the lowercase-only rule is tried first)
96($lcgvowel + $ddotmac? ) $rough → h | $1 ;
97($gvowel + $ddotmac? ) $rough → H | $1 ;
98# Go backwards with H: h/H plus a Latin vowel group becomes the vowel group followed by rough breathing.
98# Rules are listed from the longest vowel group to the shortest; the cursor is reset before the group.
99| $1 $rough ← h ($evowel $macron $ddot? $evowel2_i $macron?) ; # vowel with macron + second vowel u/y
100| $1 $rough ← h ($evowel $ddot? $evowel2 $macron?) ; # vowel + second vowel i/u/y
101| $1 $rough ← h ($evowel $macron? $ddot?) ; # single vowel
102| $1 $rough ← H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ; # same three shapes for capital H before a capital vowel
103| $1 $rough ← H ([AEIOUY] $ddot? $evowel2 $macron?) ;
104| $1 $rough ← H ([AEIOUY] $macron? $ddot?) ;
105# titlecase, have to fix individually: capital H + lowercase vowel becomes the capital vowel + rough breathing
106# in the future, we should add &uppercase() to make this easier
107| A $1 $rough ← H a ($macron  $ddot? $evowel2_i $macron?) ;
108| E $1 $rough ← H e ($macron  $ddot? $evowel2_i $macron?) ;
109| I $1 $rough ← H i ($macron  $ddot? $evowel2_i $macron?) ;
110| O $1 $rough ← H o ($macron  $ddot? $evowel2_i $macron?) ;
111| U $1 $rough ← H u ($macron $ddot? $evowel2_i $macron?) ;
112| Y $1 $rough ← H y ($macron $ddot? $evowel2_i $macron?) ;
113| A $1 $rough ← H a ($ddot? $evowel2 $macron?) ;
114| E $1 $rough ← H e ($ddot? $evowel2 $macron?) ;
115| I $1 $rough ← H i ($ddot? $evowel2 $macron?) ;
116| O $1 $rough ← H o ($ddot? $evowel2 $macron?) ;
117| U $1 $rough ← H u ($ddot? $evowel2 $macron?) ;
118| Y $1 $rough ← H y ($ddot? $evowel2 $macron?) ;
119| A $1 $rough ← H a ($macron? $ddot? ) ;
120| E $1 $rough ← H e ($macron? $ddot? ) ;
121| I $1 $rough ← H i ($macron? $ddot? ) ;
122| O $1 $rough ← H o ($macron? $ddot? ) ;
123| U $1 $rough ← H u ($macron? $ddot? ) ;
124| Y $1 $rough ← H y ($macron? $ddot? ) ;
125# Now do smooth
126# delete smooth breathing for Latin (forward direction only)
127$smooth → ;
128# insert in Greek (reverse direction), on a word-initial r or vowel group that has no breathing yet
129# the assumption is that all Marks are on letters.
130| $1 $smooth ← $notLetter { ([rR]) } [^hH$smooth$rough] ; # r/R after a non-letter, not followed by h/H or a breathing mark
131| $1 $smooth ← $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ; # vowel pair after a non-letter
132| $1 $smooth ← $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ; # single vowel after a non-letter, not followed by a second-vowel letter
133# TODO: preserve smooth/rough breathing if not
134# on initial vowel sequence
135# need to have these up here so the single-letter rules further down don't mask them
136# remove now superfluous macron when returning
137Α ← A $macron ; # reverse only: A/a with macron comes back as plain alpha
138α ← a $macron ;
139η ↔ e $macron ;
140Η ↔ E $macron ;
141φ ↔ ph ;
142Ψ } $beforeLower ↔ Ps ; # titlecase digraph when a lowercase letter follows, all caps otherwise
143Ψ ↔ PS ;
144Φ } $beforeLower ↔ Ph ;
145Φ ↔ PH ;
146ψ ↔ ps ;
147ω ↔ o $macron ;
148Ω ↔  O $macron;
149# NORMAL: one-to-one letter mappings, with context-sensitive exceptions listed before the plain rule
150α ↔ a ;
151Α ↔ A ;
152β ↔ b ;
153Β ↔ B ;
154γ } $gammaLike ↔ n } $egammaLike ; # gamma before a gamma-like letter is written n; n before g/k/x/c maps back to gamma
155γ ↔ g ;
156Γ } $gammaLike ↔ N } $egammaLike ;
157Γ ↔ G ;
158δ ↔ d ;
159Δ ↔ D ;
160ε ↔ e ;
161Ε ↔ E ;
162ζ ↔ z ;
163Ζ ↔ Z ;
164θ ↔ th ;
165Θ } $beforeLower ↔ Th ; # titlecase digraph when a lowercase letter follows, all caps otherwise
166Θ ↔ TH ;
167ι ↔ i ;
168Ι ↔ I ;
169κ ↔ k ;
170Κ ↔ K ;
171λ ↔ l ;
172Λ ↔ L ;
173μ ↔ m ;
174Μ ↔ M ;
175ν } $gammaLike → n\' ; # nu before a gamma-like letter gets an apostrophe so it is not read back as gamma. NOTE(review): one-way here but two-way for capital nu below; confirm intended
176ν ↔ n ;
177Ν } $gammaLike ↔ N\' ;
178Ν ↔ N ;
179ξ ↔ x ;
180Ξ ↔ X ;
181ο ↔ o ;
182Ο ↔ O ;
183π ↔ p ;
184Π ↔ P ;
185ρ $rough ↔ rh; # rho with rough breathing is written rh (Rh / RH for the capital)
186Ρ $rough } $beforeLower ↔ Rh ;
187Ρ $rough ↔ RH ;
188ρ ↔ r ;
189Ρ ↔ R ;
190# insert separator (apostrophe) between an already converted P/p and a letter that turns into s, so that p + s is not read back as psi
191[Pp] { } [ςσΣϷϸϺϻ] → \' ;
192# special S variants
193Ϸ ↔ Š ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L
194ϸ ↔ š ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L
195Ϻ ↔ Ŝ ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L
196ϻ ↔ ŝ ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L
197# underbar means exception: the sigma form that is unexpected for its position is written s + $underbar
198# before a letter, initial: σ is the expected form, so ς carries the underbar
199ς } $beforeLetter ↔ s $underbar } $beforeLetter;
200σ } $beforeLetter ↔ s } $beforeLetter;
201# otherwise, after a letter = final: ς is the expected form, so σ carries the underbar
202$afterLetter { σ ↔ $afterLetter { s $underbar;
203$afterLetter { ς ↔ $afterLetter { s ;
204# otherwise (isolated) = initial
205ς ↔ s $underbar;
206σ ↔ s ;
207# [Pp] { Σ ↔ \'S ;
208Σ ↔ S ;
209τ ↔ t ;
210Τ ↔ T ;
211$vowel {υ } ↔ u ; # upsilon directly after a vowel is written u, otherwise y
212υ ↔ y ;
213$vowel { Υ ↔ U ;
214Υ ↔ Y ;
215χ ↔ ch ;
216Χ } $beforeLower ↔ Ch ; # titlecase digraph when a lowercase letter follows, all caps otherwise
217Χ ↔ CH ;
218# Completeness for ASCII: reverse-only rules that rewrite Latin letters with no rule of their own
218# into Latin letters that have one; the cursor stays before the output so it is converted by the rules above
219$ignore = [[:Mark:]''] * ; # any run of marks (presumably also apostrophes; confirm the '' escape)
220| k  ← c ;
221| ph ← f ;
222| i  ← j ;
223| k ← q ;
224| b ← v } $vowel ; # v/w before a vowel is treated as b, otherwise as u
225| b ← w } $vowel;
226| u ← v ;
227| u ← w;
228| K ← C ;
229| Ph ← F ;
230| I ← J ;
231| K ← Q ;
232| B ← V  } $vowel ;
233| B ← W  } $vowel ;
234| U ← V ;
235| U ← W ;
236$rough } $ignore [:UppercaseLetter:] → H ; # leftover rough breathing next to an uppercase letter becomes capital H
237$ignore [:UppercaseLetter:] { $rough → H ;
238$rough ← H ; # any H not consumed by an earlier reverse rule becomes rough breathing
239$rough ↔ h ; # any other leftover rough breathing is h, and h maps back to rough breathing
240# Completeness for Greek: forward-only rules that replace variant/symbol forms with the base letter;
240# the cursor stays before the replacement so the base letter is then converted by the rules above
241ϐ → | β ;
242ϑ → | θ ;
243ϒ → | Υ ;
244ϕ → | φ ;
245ϖ → | π ;
246ϰ → | κ ;
247ϱ → | ρ ;
248ϲ → | σ ;
249Ϲ → | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL
250ϳ → j ; # converted directly to Latin, no reprocessing
251ϴ → | Θ ;
252ϵ → | ε ;
253µ → | μ ;
254ͺ → i; # converted directly to Latin, no reprocessing
255# delete any trailing ' marks used for roundtripping (reverse direction only; the left context is already Greek, the right context is still Latin)
256← [Ππ] { \' } [Ss] ;
257← [Νν] { \' } $egammaLike ;
258
259::NFC (NFD) ;
260# ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ;
261# ([\u0000-\u007F · [:Latin:] [:nonspacing mark:]]) ;
262# MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD
263:: ( [':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ̀-̷̹-ͅ΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ;
264			]]></tRule>
265		</transform>
266	</transforms>
267</supplementalData>
268