xref: /aosp_15_r20/external/cronet/third_party/icu/patches/wordbrk.patch (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1diff --git a/source/data/brkitr/rules/word.txt b/source/data/brkitr/rules/word.txt
2index e9420c8c..b4603823 100644
3--- a/source/data/brkitr/rules/word.txt
4+++ b/source/data/brkitr/rules/word.txt
5@@ -38,12 +38,34 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
6 $Format             = [\p{Word_Break = Format}];
7 $Katakana           = [\p{Word_Break = Katakana}];
8 $Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
9+
10+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'.
11+# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up
12+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In
13+# light of the Chromium-specific change below that breaks on full-stop (period,
14+# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case.
15+# Thus, although the upstream intent was to not break "user.name@example.com" at
16+# all, it actually would break down into {"user", ".", "name@example", ".",
17+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the
18+# previous Chromium behavior of breaking at both '@' and '.'.
19+#
20+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors
21+# (and thus this patch) for '.' and now '@'.
22 $ALetter            = [\p{Word_Break = ALetter}];
23+
24 $Single_Quote       = [\p{Word_Break = Single_Quote}];
25 $Double_Quote       = [\p{Word_Break = Double_Quote}];
26-$MidNumLet          = [\p{Word_Break = MidNumLet}];
27+
28+# Remove two full stop characters from $MidNumLet and add them to $MidNum
29+# to break a hostname into its components at the cost of breaking
30+# 'e.g.' and 'i.e.' as well.
31+# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
32+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
33+# while rules 6/7 are reverted to the old behavior we want.
34+$MidNumLet          = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
35 $MidLetter          = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]];
36-$MidNum             = [\p{Word_Break = MidNum}];
37+$MidNum             = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
38+
39 $Numeric            = [\p{Word_Break = Numeric}];
40 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
41 $WSegSpace          = [\p{Word_Break = WSegSpace}];
42diff --git a/source/data/brkitr/rules/word_POSIX.txt b/source/data/brkitr/rules/word_POSIX.txt
43index 3cd0556e..8e63ee4c 100644
44--- a/source/data/brkitr/rules/word_POSIX.txt
45+++ b/source/data/brkitr/rules/word_POSIX.txt
46@@ -38,12 +38,29 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
47 $Format             = [\p{Word_Break = Format}];
48 $Katakana           = [\p{Word_Break = Katakana}];
49 $Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
50+
51+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'.
52+# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up
53+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In
54+# light of the Chromium-specific change below that breaks on full-stop (period,
55+# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case.
56+# Thus, although the upstream intent was to not break "user.name@example.com" at
57+# all, it actually would break down into {"user", ".", "name@example", ".",
58+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the
59+# previous Chromium behavior of breaking at both '@' and '.'.
60+#
61+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors
62+# (and thus this patch) for '.' and now '@'.
63 $ALetter            = [\p{Word_Break = ALetter}];
64+
65 $Single_Quote       = [\p{Word_Break = Single_Quote}];
66 $Double_Quote       = [\p{Word_Break = Double_Quote}];
67-$MidNumLet          = [\p{Word_Break = MidNumLet} - [.]];
68+# Remove full-width full stop (\uff0e) from $MidNumLet and add it to $MidNum, in
69+# addition to the ordinary full stop (dot, period, '.', \u002e).
70+$MidNumLet          = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
71 $MidLetter          = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]];
72-$MidNum             = [\p{Word_Break = MidNum} [.]];
73+$MidNum             = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
74+
75 $Numeric            = [\p{Word_Break = Numeric}];
76 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
77 $WSegSpace          = [\p{Word_Break = WSegSpace}];
78diff --git a/source/data/brkitr/rules/word_fi_sv.txt b/source/data/brkitr/rules/word_fi_sv.txt
79index daf5b355..ca2decfc 100644
80--- a/source/data/brkitr/rules/word_fi_sv.txt
81+++ b/source/data/brkitr/rules/word_fi_sv.txt
82@@ -38,12 +38,33 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
83 $Format             = [\p{Word_Break = Format}];
84 $Katakana           = [\p{Word_Break = Katakana}];
85 $Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
86+
87+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'.
88+# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up
89+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In
90+# light of the Chromium-specific change below that breaks on full-stop (period,
91+# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case.
92+# Thus, although the upstream intent was to not break "user.name@example.com" at
93+# all, it actually would break down into {"user", ".", "name@example", ".",
94+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the
95+# previous Chromium behavior of breaking at both '@' and '.'.
96+#
97+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors
98+# (and thus this patch) for '.' and now '@'.
99 $ALetter            = [\p{Word_Break = ALetter}];
100+
101 $Single_Quote       = [\p{Word_Break = Single_Quote}];
102 $Double_Quote       = [\p{Word_Break = Double_Quote}];
103-$MidNumLet          = [\p{Word_Break = MidNumLet}];
104+# Remove two full stop characters from $MidNumLet and add them to $MidNum
105+# to break a hostname into its components at the cost of breaking
106+# 'e.g.' and 'i.e.' as well.
107+# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
108+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
109+# while rules 6/7 are reverted to the old behavior we want.
110+$MidNumLet          = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
111 $MidLetter          = [\p{Word_Break = MidLetter}];
112-$MidNum             = [\p{Word_Break = MidNum}];
113+$MidNum             = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
114+
115 $Numeric            = [\p{Word_Break = Numeric}];
116 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
117 $WSegSpace          = [\p{Word_Break = WSegSpace}];
118