1diff --git a/source/data/brkitr/rules/word.txt b/source/data/brkitr/rules/word.txt 2index e9420c8c..b4603823 100644 3--- a/source/data/brkitr/rules/word.txt 4+++ b/source/data/brkitr/rules/word.txt 5@@ -38,12 +38,34 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; 6 $Format = [\p{Word_Break = Format}]; 7 $Katakana = [\p{Word_Break = Katakana}]; 8 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; 9+ 10+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'. 11+# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up 12+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In 13+# light of the Chromium-specific change below that breaks on full-stop (period, 14+# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case. 15+# Thus, although the upstream intent was to not break "user.name@example.com" at 16+# all, it actually would break down into {"user", ".", "name@example", ".", 17+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the 18+# previous Chromium behavior of breaking at both '@' and '.'. 19+# 20+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors 21+# (and thus this patch) for '.' and now '@'. 22 $ALetter = [\p{Word_Break = ALetter}]; 23+ 24 $Single_Quote = [\p{Word_Break = Single_Quote}]; 25 $Double_Quote = [\p{Word_Break = Double_Quote}]; 26-$MidNumLet = [\p{Word_Break = MidNumLet}]; 27+ 28+# Remove two full stop characters from $MidNumLet and add them to $MidNum 29+# to break a hostname into its components at the cost of breaking 30+# 'e.g.' and 'i.e.' as well. 31+# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. 32+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected 33+# while rules 6/7 are reverted to the old behavior we want. 
34+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; 35 $MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; 36-$MidNum = [\p{Word_Break = MidNum}]; 37+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; 38+ 39 $Numeric = [\p{Word_Break = Numeric}]; 40 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 41 $WSegSpace = [\p{Word_Break = WSegSpace}]; 42diff --git a/source/data/brkitr/rules/word_POSIX.txt b/source/data/brkitr/rules/word_POSIX.txt 43index 3cd0556e..8e63ee4c 100644 44--- a/source/data/brkitr/rules/word_POSIX.txt 45+++ b/source/data/brkitr/rules/word_POSIX.txt 46@@ -38,12 +38,29 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; 47 $Format = [\p{Word_Break = Format}]; 48 $Katakana = [\p{Word_Break = Katakana}]; 49 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; 50+ 51+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'. 52+# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up 53+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In 54+# light of the Chromium-specific change below that breaks on full-stop (period, 55+# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case. 56+# Thus, although the upstream intent was to not break "user.name@example.com" at 57+# all, it actually would break down into {"user", ".", "name@example", ".", 58+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the 59+# previous Chromium behavior of breaking at both '@' and '.'. 60+# 61+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors 62+# (and thus this patch) for '.' and now '@'. 
63 $ALetter = [\p{Word_Break = ALetter}]; 64+ 65 $Single_Quote = [\p{Word_Break = Single_Quote}]; 66 $Double_Quote = [\p{Word_Break = Double_Quote}]; 67-$MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; 68+# Remove full-width full stop (\uff0e) from $MidNumLet and add it to $MidNum, in 69+# addition to the ordinary full stop (dot, period, '.', \u002e). 70+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; 71 $MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; 72-$MidNum = [\p{Word_Break = MidNum} [.]]; 73+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; 74+ 75 $Numeric = [\p{Word_Break = Numeric}]; 76 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 77 $WSegSpace = [\p{Word_Break = WSegSpace}]; 78diff --git a/source/data/brkitr/rules/word_fi_sv.txt b/source/data/brkitr/rules/word_fi_sv.txt 79index daf5b355..ca2decfc 100644 80--- a/source/data/brkitr/rules/word_fi_sv.txt 81+++ b/source/data/brkitr/rules/word_fi_sv.txt 82@@ -38,12 +38,33 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; 83 $Format = [\p{Word_Break = Format}]; 84 $Katakana = [\p{Word_Break = Katakana}]; 85 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; 86+ 87+# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'. 88+# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up 89+# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In 90+# light of the Chromium-specific change below that breaks on full-stop (period, 91+# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case. 92+# Thus, although the upstream intent was to not break "user.name@example.com" at 93+# all, it actually would break down into {"user", ".", "name@example", ".", 94+# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the 95+# previous Chromium behavior of breaking at both '@' and '.'. 
96+# 97+# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors 98+# (and thus this patch) for '.' and now '@'. 99 $ALetter = [\p{Word_Break = ALetter}]; 100+ 101 $Single_Quote = [\p{Word_Break = Single_Quote}]; 102 $Double_Quote = [\p{Word_Break = Double_Quote}]; 103-$MidNumLet = [\p{Word_Break = MidNumLet}]; 104+# Remove two full stop characters from $MidNumLet and add them to $MidNum 105+# to break a hostname into its components at the cost of breaking 106+# 'e.g.' and 'i.e.' as well. 107+# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. 108+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected 109+# while rules 6/7 are reverted to the old behavior we want. 110+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; 111 $MidLetter = [\p{Word_Break = MidLetter}]; 112-$MidNum = [\p{Word_Break = MidNum}]; 113+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; 114+ 115 $Numeric = [\p{Word_Break = Numeric}]; 116 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 117 $WSegSpace = [\p{Word_Break = WSegSpace}]; 118