xref: /aosp_15_r20/external/cldr/common/transforms/Han-Spacedhan.xml (revision 912701f9769bb47905792267661f0baf2b85bed5)
1<?xml version="1.0" encoding="UTF-8" ?>
2<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
3<!--
4Copyright © 1991-2013 Unicode, Inc.
5CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
6For terms of use, see http://www.unicode.org/copyright.html
7-->
8<supplementalData>
9	<version number="$Revision$"/>
10	<transforms>
11		<transform source="Han" target="Spacedhan" direction="both" visibility="internal">
12			<tRule>
13# Han to Spacedhan, and back. Only intended for internal use. Forward pipeline: normalize, fold widths, map CJK punctuation to Latin-style punctuation, then insert spaces; rule order matters.
14# Make sure Han characters are normalized, including compatibility characters whose decompositions contain Han.
15# The first (literal) set in the filter below is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:]
16# where XXX is the resolved [:ideographic:][:sc=han:]. That literal set needs updating with each Unicode release! NOTE(review): in this copy the end of the set shows U+FFFD replacement characters, presumably supplementary-plane characters mangled in extraction - verify against upstream CLDR.
17:: [[、。々《-』〜・㆒-㆟㈠-㉇㊀-㊰㋀-㋋ ㍘-㍰㍻-㍿㏠-㏾��-����-����-������][:ideographic:][:sc=han:]] nfkc;
18:: fullwidth-halfwidth;
19。 → '.';
20。→ '.';
21、→ ',';
22、→ ',';
23《→ '«';
24》→ '»';
25〈 → '‹';
26 〉→ '›';
27「→ '‘';
28」→ '’';
29「→ '‘';
30」→ '’';
31『→ '“';
32』→ '”';
33# NOTE(review): several punctuation rules above and below appear twice with the same left-hand side in this copy (full stop, comma, corner brackets, middle dot); presumably each pair originally covered two width variants of the character - verify against upstream CLDR.
34・→ '‧';
35・ → '‧';
36々→ '⓶';
37〜→ '~';
38# Punctuation classes used as context by the spacing rules below.
39$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];
40$initialPunct = [:Ps:][:Pi:];
41# Forward: add a space between any ideograph or terminal punctuation and a following letter, and
42# between a letter (plus any trailing marks) and a following ideograph or initial punctuation.
43[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;
44[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ;
45# Backward: remove a single space between an ideograph and a letter, in either order. Only ideograph/letter contexts are listed here; the punctuation contexts of the forward rules are not.
46← [:Ideographic:] { ' ' } [:Letter:] ;
47← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ;
48			</tRule>
49		</transform>
50	</transforms>
51</supplementalData>
52