xref: /aosp_15_r20/external/icu/libicu/cts_headers/bocsu.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
5*0e209d39SAndroid Build Coastguard Worker *   Copyright (C) 2001-2014, International Business Machines
6*0e209d39SAndroid Build Coastguard Worker *   Corporation and others.  All Rights Reserved.
7*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
8*0e209d39SAndroid Build Coastguard Worker *   file name:  bocsu.h
9*0e209d39SAndroid Build Coastguard Worker *   encoding:   UTF-8
10*0e209d39SAndroid Build Coastguard Worker *   tab size:   8 (not used)
11*0e209d39SAndroid Build Coastguard Worker *   indentation:4
12*0e209d39SAndroid Build Coastguard Worker *
13*0e209d39SAndroid Build Coastguard Worker *   Author: Markus W. Scherer
14*0e209d39SAndroid Build Coastguard Worker *
15*0e209d39SAndroid Build Coastguard Worker *   Modification history:
16*0e209d39SAndroid Build Coastguard Worker *   05/18/2001  weiv    Made into separate module
17*0e209d39SAndroid Build Coastguard Worker */
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #ifndef BOCSU_H
20*0e209d39SAndroid Build Coastguard Worker #define BOCSU_H
21*0e209d39SAndroid Build Coastguard Worker 
22*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
23*0e209d39SAndroid Build Coastguard Worker 
24*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION
25*0e209d39SAndroid Build Coastguard Worker 
26*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
27*0e209d39SAndroid Build Coastguard Worker 
28*0e209d39SAndroid Build Coastguard Worker class ByteSink;
29*0e209d39SAndroid Build Coastguard Worker 
30*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
31*0e209d39SAndroid Build Coastguard Worker 
32*0e209d39SAndroid Build Coastguard Worker /*
33*0e209d39SAndroid Build Coastguard Worker  * "BOCSU"
34*0e209d39SAndroid Build Coastguard Worker  * Binary Ordered Compression Scheme for Unicode
35*0e209d39SAndroid Build Coastguard Worker  *
36*0e209d39SAndroid Build Coastguard Worker  * Specific application:
37*0e209d39SAndroid Build Coastguard Worker  *
38*0e209d39SAndroid Build Coastguard Worker  * Encode a Unicode string for the identical level of a sort key.
39*0e209d39SAndroid Build Coastguard Worker  * Restrictions:
40*0e209d39SAndroid Build Coastguard Worker  * - byte stream (unsigned 8-bit bytes)
41*0e209d39SAndroid Build Coastguard Worker  * - lexical order of the identical-level run must be
42*0e209d39SAndroid Build Coastguard Worker  *   the same as code point order for the string
43*0e209d39SAndroid Build Coastguard Worker  * - avoid byte values 0, 1, 2
44*0e209d39SAndroid Build Coastguard Worker  *
45*0e209d39SAndroid Build Coastguard Worker  * Method: Slope Detection
46*0e209d39SAndroid Build Coastguard Worker  * Remember the previous code point (initial 0).
47*0e209d39SAndroid Build Coastguard Worker  * For each cp in the string, encode the difference to the previous one.
48*0e209d39SAndroid Build Coastguard Worker  *
49*0e209d39SAndroid Build Coastguard Worker  * With a compact encoding of differences, this yields good results for
50*0e209d39SAndroid Build Coastguard Worker  * small scripts and UTF-like results otherwise.
51*0e209d39SAndroid Build Coastguard Worker  *
52*0e209d39SAndroid Build Coastguard Worker  * Encoding of differences:
53*0e209d39SAndroid Build Coastguard Worker  * - Similar to a UTF, encoding the length of the byte sequence in the lead bytes.
54*0e209d39SAndroid Build Coastguard Worker  * - Does not need to be friendly for decoding or random access
55*0e209d39SAndroid Build Coastguard Worker  *   (trail byte values may overlap with lead/single byte values).
56*0e209d39SAndroid Build Coastguard Worker  * - The signedness must be encoded as the most significant part.
57*0e209d39SAndroid Build Coastguard Worker  *
58*0e209d39SAndroid Build Coastguard Worker  * We encode differences with few bytes if their absolute values are small.
59*0e209d39SAndroid Build Coastguard Worker  * For correct ordering, we must treat the entire value range -10ffff..+10ffff
60*0e209d39SAndroid Build Coastguard Worker  * in ascending order, which forbids encoding the sign and the absolute value separately.
61*0e209d39SAndroid Build Coastguard Worker  * Instead, we split the lead byte range in the middle and encode non-negative values
62*0e209d39SAndroid Build Coastguard Worker  * going up and negative values going down.
63*0e209d39SAndroid Build Coastguard Worker  *
64*0e209d39SAndroid Build Coastguard Worker  * For very small absolute values, the difference is added to a middle byte value
65*0e209d39SAndroid Build Coastguard Worker  * for single-byte encoded differences.
66*0e209d39SAndroid Build Coastguard Worker  * For somewhat larger absolute values, the difference is divided by the number
67*0e209d39SAndroid Build Coastguard Worker  * of byte values available, the modulo is used for one trail byte, and the remainder
68*0e209d39SAndroid Build Coastguard Worker  * is added to a lead byte avoiding the single-byte range.
69*0e209d39SAndroid Build Coastguard Worker  * For large absolute values, the difference is similarly encoded in three bytes.
70*0e209d39SAndroid Build Coastguard Worker  *
71*0e209d39SAndroid Build Coastguard Worker  * This encoding does not use byte values 0, 1, 2, but uses all other byte values
72*0e209d39SAndroid Build Coastguard Worker  * for lead/single bytes so that the middle range of single bytes is as large
73*0e209d39SAndroid Build Coastguard Worker  * as possible.
74*0e209d39SAndroid Build Coastguard Worker  * Note that the lead byte ranges overlap some, but that the sequences as a whole
75*0e209d39SAndroid Build Coastguard Worker  * are well ordered. I.e., even if the lead byte is the same for sequences of different
76*0e209d39SAndroid Build Coastguard Worker  * lengths, the trail bytes establish correct order.
77*0e209d39SAndroid Build Coastguard Worker  * It would be possible to encode slightly larger ranges for each length (>1) by
78*0e209d39SAndroid Build Coastguard Worker  * subtracting the lower bound of the range. However, that would also slow down the
79*0e209d39SAndroid Build Coastguard Worker  * calculation.
80*0e209d39SAndroid Build Coastguard Worker  *
81*0e209d39SAndroid Build Coastguard Worker  * For the actual string encoding, an optimization moves the previous code point value
82*0e209d39SAndroid Build Coastguard Worker  * to the middle of its Unicode script block to minimize the differences in
83*0e209d39SAndroid Build Coastguard Worker  * same-script text runs.
84*0e209d39SAndroid Build Coastguard Worker  */
85*0e209d39SAndroid Build Coastguard Worker 
86*0e209d39SAndroid Build Coastguard Worker /* Do not use byte values 0, 1, 2 because they are separators in sort keys. */
87*0e209d39SAndroid Build Coastguard Worker #define SLOPE_MIN           3
88*0e209d39SAndroid Build Coastguard Worker #define SLOPE_MAX           0xff
89*0e209d39SAndroid Build Coastguard Worker #define SLOPE_MIDDLE        0x81
90*0e209d39SAndroid Build Coastguard Worker 
91*0e209d39SAndroid Build Coastguard Worker #define SLOPE_TAIL_COUNT    (SLOPE_MAX-SLOPE_MIN+1)
92*0e209d39SAndroid Build Coastguard Worker 
93*0e209d39SAndroid Build Coastguard Worker #define SLOPE_MAX_BYTES     4
94*0e209d39SAndroid Build Coastguard Worker 
95*0e209d39SAndroid Build Coastguard Worker /*
96*0e209d39SAndroid Build Coastguard Worker  * Number of lead bytes:
97*0e209d39SAndroid Build Coastguard Worker  * 1        middle byte for 0
98*0e209d39SAndroid Build Coastguard Worker  * 2*80=160 single bytes for !=0
99*0e209d39SAndroid Build Coastguard Worker  * 2*42=84  for double-byte values
100*0e209d39SAndroid Build Coastguard Worker  * 2*3=6    for 3-byte values
101*0e209d39SAndroid Build Coastguard Worker  * 2*1=2    for 4-byte values
102*0e209d39SAndroid Build Coastguard Worker  *
103*0e209d39SAndroid Build Coastguard Worker  * The sum must be <=SLOPE_TAIL_COUNT.
104*0e209d39SAndroid Build Coastguard Worker  *
105*0e209d39SAndroid Build Coastguard Worker  * Why these numbers?
106*0e209d39SAndroid Build Coastguard Worker  * - There should be >=128 single-byte values to cover 128-blocks
107*0e209d39SAndroid Build Coastguard Worker  *   with small scripts.
108*0e209d39SAndroid Build Coastguard Worker  * - There should be >=20902 single/double-byte values to cover Unihan.
109*0e209d39SAndroid Build Coastguard Worker  * - It helps CJK Extension B some if there are 3-byte values that cover
110*0e209d39SAndroid Build Coastguard Worker  *   the distance between them and Unihan.
111*0e209d39SAndroid Build Coastguard Worker  *   This also helps to jump among distant places in the BMP.
112*0e209d39SAndroid Build Coastguard Worker  * - Four-byte values are necessary to cover the rest of Unicode.
113*0e209d39SAndroid Build Coastguard Worker  *
114*0e209d39SAndroid Build Coastguard Worker  * Symmetrical lead byte counts are for convenience.
115*0e209d39SAndroid Build Coastguard Worker  * With an equal distribution of even and odd differences there is also
116*0e209d39SAndroid Build Coastguard Worker  * no advantage to asymmetrical lead byte counts.
117*0e209d39SAndroid Build Coastguard Worker  */
118*0e209d39SAndroid Build Coastguard Worker #define SLOPE_SINGLE        80
119*0e209d39SAndroid Build Coastguard Worker #define SLOPE_LEAD_2        42
120*0e209d39SAndroid Build Coastguard Worker #define SLOPE_LEAD_3        3
121*0e209d39SAndroid Build Coastguard Worker #define SLOPE_LEAD_4        1
122*0e209d39SAndroid Build Coastguard Worker 
123*0e209d39SAndroid Build Coastguard Worker /* The difference value range for single-byters. */
124*0e209d39SAndroid Build Coastguard Worker #define SLOPE_REACH_POS_1   SLOPE_SINGLE
125*0e209d39SAndroid Build Coastguard Worker #define SLOPE_REACH_NEG_1   (-SLOPE_SINGLE)
126*0e209d39SAndroid Build Coastguard Worker 
127*0e209d39SAndroid Build Coastguard Worker /* The difference value range for double-byters. */
128*0e209d39SAndroid Build Coastguard Worker #define SLOPE_REACH_POS_2   (SLOPE_LEAD_2*SLOPE_TAIL_COUNT+(SLOPE_LEAD_2-1))
129*0e209d39SAndroid Build Coastguard Worker #define SLOPE_REACH_NEG_2   (-SLOPE_REACH_POS_2-1)
130*0e209d39SAndroid Build Coastguard Worker 
131*0e209d39SAndroid Build Coastguard Worker /* The difference value range for 3-byters. */
132*0e209d39SAndroid Build Coastguard Worker #define SLOPE_REACH_POS_3   (SLOPE_LEAD_3*SLOPE_TAIL_COUNT*SLOPE_TAIL_COUNT+(SLOPE_LEAD_3-1)*SLOPE_TAIL_COUNT+(SLOPE_TAIL_COUNT-1))
133*0e209d39SAndroid Build Coastguard Worker #define SLOPE_REACH_NEG_3   (-SLOPE_REACH_POS_3-1)
134*0e209d39SAndroid Build Coastguard Worker 
135*0e209d39SAndroid Build Coastguard Worker /* The lead byte start values. */
136*0e209d39SAndroid Build Coastguard Worker #define SLOPE_START_POS_2   (SLOPE_MIDDLE+SLOPE_SINGLE+1)
137*0e209d39SAndroid Build Coastguard Worker #define SLOPE_START_POS_3   (SLOPE_START_POS_2+SLOPE_LEAD_2)
138*0e209d39SAndroid Build Coastguard Worker 
139*0e209d39SAndroid Build Coastguard Worker #define SLOPE_START_NEG_2   (SLOPE_MIDDLE+SLOPE_REACH_NEG_1)
140*0e209d39SAndroid Build Coastguard Worker #define SLOPE_START_NEG_3   (SLOPE_START_NEG_2-SLOPE_LEAD_2)
141*0e209d39SAndroid Build Coastguard Worker 
142*0e209d39SAndroid Build Coastguard Worker /*
143*0e209d39SAndroid Build Coastguard Worker  * Integer division and modulo with negative numerators
144*0e209d39SAndroid Build Coastguard Worker  * yields negative modulo results and quotients that are one more than
145*0e209d39SAndroid Build Coastguard Worker  * what we need here.
146*0e209d39SAndroid Build Coastguard Worker  */
147*0e209d39SAndroid Build Coastguard Worker #define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
148*0e209d39SAndroid Build Coastguard Worker     (m)=(n)%(d); \
149*0e209d39SAndroid Build Coastguard Worker     (n)/=(d); \
150*0e209d39SAndroid Build Coastguard Worker     if((m)<0) { \
151*0e209d39SAndroid Build Coastguard Worker         --(n); \
152*0e209d39SAndroid Build Coastguard Worker         (m)+=(d); \
153*0e209d39SAndroid Build Coastguard Worker     } \
154*0e209d39SAndroid Build Coastguard Worker } UPRV_BLOCK_MACRO_END
155*0e209d39SAndroid Build Coastguard Worker 
156*0e209d39SAndroid Build Coastguard Worker U_CFUNC UChar32
157*0e209d39SAndroid Build Coastguard Worker u_writeIdenticalLevelRun(UChar32 prev, const UChar *s, int32_t length, icu::ByteSink &sink);
158*0e209d39SAndroid Build Coastguard Worker 
159*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_COLLATION */
160*0e209d39SAndroid Build Coastguard Worker 
161*0e209d39SAndroid Build Coastguard Worker #endif
162