1*6a54128fSAndroid Build Coastguard Worker /*
2*6a54128fSAndroid Build Coastguard Worker * Copyright (c) 2014 SGI.
3*6a54128fSAndroid Build Coastguard Worker * Copyright (c) 2018 Collabora Ltd.
4*6a54128fSAndroid Build Coastguard Worker * All rights reserved.
5*6a54128fSAndroid Build Coastguard Worker *
6*6a54128fSAndroid Build Coastguard Worker * This program is free software; you can redistribute it and/or
7*6a54128fSAndroid Build Coastguard Worker * modify it under the terms of the GNU General Public License as
8*6a54128fSAndroid Build Coastguard Worker * published by the Free Software Foundation.
9*6a54128fSAndroid Build Coastguard Worker *
10*6a54128fSAndroid Build Coastguard Worker * This program is distributed in the hope that it would be useful,
11*6a54128fSAndroid Build Coastguard Worker * but WITHOUT ANY WARRANTY; without even the implied warranty of
12*6a54128fSAndroid Build Coastguard Worker * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13*6a54128fSAndroid Build Coastguard Worker * GNU General Public License for more details.
14*6a54128fSAndroid Build Coastguard Worker *
15*6a54128fSAndroid Build Coastguard Worker */
16*6a54128fSAndroid Build Coastguard Worker
17*6a54128fSAndroid Build Coastguard Worker /*
18*6a54128fSAndroid Build Coastguard Worker * This code is adapted from the Linux Kernel. We have a
19*6a54128fSAndroid Build Coastguard Worker * userspace version here such that the hashes will match that
20*6a54128fSAndroid Build Coastguard Worker * implementation.
21*6a54128fSAndroid Build Coastguard Worker */
22*6a54128fSAndroid Build Coastguard Worker
23*6a54128fSAndroid Build Coastguard Worker #include "config.h"
24*6a54128fSAndroid Build Coastguard Worker #include <stdint.h>
25*6a54128fSAndroid Build Coastguard Worker #include <unistd.h>
26*6a54128fSAndroid Build Coastguard Worker #include <string.h>
27*6a54128fSAndroid Build Coastguard Worker #include <limits.h>
28*6a54128fSAndroid Build Coastguard Worker #include <errno.h>
29*6a54128fSAndroid Build Coastguard Worker
30*6a54128fSAndroid Build Coastguard Worker #include "ext2_fs.h"
31*6a54128fSAndroid Build Coastguard Worker #include "ext2fs.h"
32*6a54128fSAndroid Build Coastguard Worker #include "ext2fsP.h"
33*6a54128fSAndroid Build Coastguard Worker
/*
 * Encode a unicode version number as a single unsigned int.
 *
 * The major number is shifted left by UNICODE_MAJ_SHIFT, the minor
 * number by UNICODE_MIN_SHIFT, and the revision occupies the low bits.
 * The fields only stay disjoint if MIN and REV each fit in 8 bits.
 */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

#define UNICODE_AGE(MAJ, MIN, REV)			\
	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
	 ((unsigned int)(REV)))
42*6a54128fSAndroid Build Coastguard Worker
/*
 * Size in bytes of the scratch buffer used for a synthesized Hangul
 * leaf: 2 header bytes, up to three 3-byte UTF-8 sequences, and a
 * terminating NUL.  Needed in struct utf8cursor below.
 */
#define UTF8HANGULLEAF	(12)

/*
 * Cursor structure used by the normalizer.
 *
 * NOTE(review): the code that drives the cursor is outside this part
 * of the file; the per-field notes are inferred from the field names
 * and should be confirmed against that code.
 */
struct utf8cursor {
	const struct utf8data	*data;	/* presumably the data table in use */
	const char	*s;		/* presumably the string being normalized */
	const char	*p;		/* presumably a secondary position in s */
	const char	*ss;		/* presumably a saved copy of s */
	const char	*sp;		/* presumably a saved copy of p */
	unsigned int	len;		/* presumably the remaining length */
	unsigned int	slen;		/* presumably a saved copy of len */
	short int	ccc;		/* presumably the current combining class */
	short int	nccc;		/* presumably the next combining class */
	/* Scratch space for an algorithmically decomposed Hangul leaf. */
	unsigned char	hangul[UTF8HANGULLEAF];
};
61*6a54128fSAndroid Build Coastguard Worker
62*6a54128fSAndroid Build Coastguard Worker /*
63*6a54128fSAndroid Build Coastguard Worker * Initialize a utf8cursor to normalize a string.
64*6a54128fSAndroid Build Coastguard Worker * Returns 0 on success.
65*6a54128fSAndroid Build Coastguard Worker * Returns -1 on failure.
66*6a54128fSAndroid Build Coastguard Worker */
67*6a54128fSAndroid Build Coastguard Worker // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
68*6a54128fSAndroid Build Coastguard Worker // const char *s);
69*6a54128fSAndroid Build Coastguard Worker // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
70*6a54128fSAndroid Build Coastguard Worker // const char *s, size_t len);
71*6a54128fSAndroid Build Coastguard Worker
72*6a54128fSAndroid Build Coastguard Worker /*
73*6a54128fSAndroid Build Coastguard Worker * Get the next byte in the normalization.
74*6a54128fSAndroid Build Coastguard Worker * Returns a value > 0 && < 256 on success.
75*6a54128fSAndroid Build Coastguard Worker * Returns 0 when the end of the normalization is reached.
76*6a54128fSAndroid Build Coastguard Worker * Returns -1 if the string being normalized is not valid UTF-8.
77*6a54128fSAndroid Build Coastguard Worker */
78*6a54128fSAndroid Build Coastguard Worker // extern int utf8byte(struct utf8cursor *u8c);
79*6a54128fSAndroid Build Coastguard Worker
80*6a54128fSAndroid Build Coastguard Worker
/*
 * Descriptor for one trie stored in the utf8data[] table.
 */
struct utf8data {
	/* NOTE(review): presumably a UNICODE_AGE()-encoded version limit. */
	unsigned int maxage;
	/* Index into utf8data[] at which this trie's root node starts. */
	unsigned int offset;
};
85*6a54128fSAndroid Build Coastguard Worker
86*6a54128fSAndroid Build Coastguard Worker #define __INCLUDED_FROM_UTF8NORM_C__
87*6a54128fSAndroid Build Coastguard Worker #include "utf8data.h"
88*6a54128fSAndroid Build Coastguard Worker #undef __INCLUDED_FROM_UTF8NORM_C__
89*6a54128fSAndroid Build Coastguard Worker
/*
 * Number of elements in a true array.  Must not be applied to a
 * pointer (including an array-typed function parameter).
 */
#define ARRAY_SIZE(array)			\
	(sizeof(array) / sizeof((array)[0]))
92*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Check whether the unicode version maj.min.rev appears in utf8agetab[].
 *
 * The table is scanned from its last entry towards its first; the scan
 * stops early at an entry equal to 0.
 *
 * Returns 1 if the version is found, 0 otherwise.
 * Currently compiled out.
 */
static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
{
	int i = ARRAY_SIZE(utf8agetab) - 1;
	unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);

	while (i >= 0 && utf8agetab[i] != 0) {
		if (sb_utf8version == utf8agetab[i])
			return 1;
		i--;
	}
	return 0;
}
#endif
108*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Return utf8vers, which is declared outside this part of the file
 * (presumably the newest unicode version in the data tables).
 * Currently compiled out.
 */
static int utf8version_latest(void)
{
	return utf8vers;
}
#endif
115*6a54128fSAndroid Build Coastguard Worker
116*6a54128fSAndroid Build Coastguard Worker /*
117*6a54128fSAndroid Build Coastguard Worker * UTF-8 valid ranges.
118*6a54128fSAndroid Build Coastguard Worker *
119*6a54128fSAndroid Build Coastguard Worker * The UTF-8 encoding spreads the bits of a 32bit word over several
120*6a54128fSAndroid Build Coastguard Worker * bytes. This table gives the ranges that can be held and how they'd
121*6a54128fSAndroid Build Coastguard Worker * be represented.
122*6a54128fSAndroid Build Coastguard Worker *
123*6a54128fSAndroid Build Coastguard Worker * 0x00000000 0x0000007F: 0xxxxxxx
124*6a54128fSAndroid Build Coastguard Worker * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
125*6a54128fSAndroid Build Coastguard Worker * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
126*6a54128fSAndroid Build Coastguard Worker * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
127*6a54128fSAndroid Build Coastguard Worker * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
128*6a54128fSAndroid Build Coastguard Worker * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
129*6a54128fSAndroid Build Coastguard Worker *
130*6a54128fSAndroid Build Coastguard Worker * There is an additional requirement on UTF-8, in that only the
131*6a54128fSAndroid Build Coastguard Worker * shortest representation of a 32bit value is to be used. A decoder
132*6a54128fSAndroid Build Coastguard Worker * must not decode sequences that do not satisfy this requirement.
133*6a54128fSAndroid Build Coastguard Worker * Thus the allowed ranges have a lower bound.
134*6a54128fSAndroid Build Coastguard Worker *
135*6a54128fSAndroid Build Coastguard Worker * 0x00000000 0x0000007F: 0xxxxxxx
136*6a54128fSAndroid Build Coastguard Worker * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
137*6a54128fSAndroid Build Coastguard Worker * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
138*6a54128fSAndroid Build Coastguard Worker * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
139*6a54128fSAndroid Build Coastguard Worker * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
140*6a54128fSAndroid Build Coastguard Worker * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
141*6a54128fSAndroid Build Coastguard Worker *
142*6a54128fSAndroid Build Coastguard Worker * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
143*6a54128fSAndroid Build Coastguard Worker * 17 planes of 65536 values. This limits the sequences actually seen
144*6a54128fSAndroid Build Coastguard Worker * even more, to just the following.
145*6a54128fSAndroid Build Coastguard Worker *
146*6a54128fSAndroid Build Coastguard Worker * 0 - 0x7F: 0 - 0x7F
147*6a54128fSAndroid Build Coastguard Worker * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
148*6a54128fSAndroid Build Coastguard Worker * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
149*6a54128fSAndroid Build Coastguard Worker * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
150*6a54128fSAndroid Build Coastguard Worker *
151*6a54128fSAndroid Build Coastguard Worker * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
152*6a54128fSAndroid Build Coastguard Worker *
 * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character.  This makes the UTF-8
 * representation of Unicode strictly smaller than UTF-32.
156*6a54128fSAndroid Build Coastguard Worker *
157*6a54128fSAndroid Build Coastguard Worker * The shortest sequence requirement was introduced by:
158*6a54128fSAndroid Build Coastguard Worker * Corrigendum #1: UTF-8 Shortest Form
159*6a54128fSAndroid Build Coastguard Worker * It can be found here:
160*6a54128fSAndroid Build Coastguard Worker * http://www.unicode.org/versions/corrigendum1.html
161*6a54128fSAndroid Build Coastguard Worker *
162*6a54128fSAndroid Build Coastguard Worker */
163*6a54128fSAndroid Build Coastguard Worker
/*
 * Return the number of bytes used by the current UTF-8 sequence.
 * Assumes the input points to the first byte of a valid UTF-8
 * sequence.
 *
 * Only the lead byte is inspected: a byte below 0xC0 yields 1,
 * 0xC0-0xDF yields 2, 0xE0-0xEF yields 3 and 0xF0 or above yields 4.
 */
static inline int utf8clen(const char *s)
{
	/* Read as unsigned so bytes >= 0x80 compare correctly. */
	unsigned char c = *s;

	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
}
175*6a54128fSAndroid Build Coastguard Worker
/*
 * Decode a 3-byte UTF-8 sequence.
 *
 * Reads exactly three bytes from str and returns the code point they
 * encode.  No validation is performed: the low 4 bits of the first
 * byte and the low 6 bits of the next two bytes are simply combined.
 */
static unsigned int
utf8decode3(const char *str)
{
	unsigned int	uc;

	uc = *str++ & 0x0F;
	uc <<= 6;
	uc |= *str++ & 0x3F;
	uc <<= 6;
	uc |= *str & 0x3F;

	return uc;
}
192*6a54128fSAndroid Build Coastguard Worker
/*
 * Encode a 3-byte UTF-8 sequence.
 *
 * Writes exactly three bytes to str (no NUL terminator) and returns
 * the number of bytes written, which is always 3.
 *
 * val is expected to be a code point that needs three bytes
 * (0x0800 - 0xFFFF); the lead byte is not masked, so larger values
 * would corrupt it.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	/* Fill from the last byte backwards, 6 payload bits at a time. */
	str[2] = (val & 0x3F) | 0x80;
	val >>= 6;
	str[1] = (val & 0x3F) | 0x80;
	val >>= 6;
	str[0] = val | 0xE0;

	return 3;
}
207*6a54128fSAndroid Build Coastguard Worker
208*6a54128fSAndroid Build Coastguard Worker /*
209*6a54128fSAndroid Build Coastguard Worker * utf8trie_t
210*6a54128fSAndroid Build Coastguard Worker *
211*6a54128fSAndroid Build Coastguard Worker * A compact binary tree, used to decode UTF-8 characters.
212*6a54128fSAndroid Build Coastguard Worker *
213*6a54128fSAndroid Build Coastguard Worker * Internal nodes are one byte for the node itself, and up to three
214*6a54128fSAndroid Build Coastguard Worker * bytes for an offset into the tree. The first byte contains the
215*6a54128fSAndroid Build Coastguard Worker * following information:
216*6a54128fSAndroid Build Coastguard Worker * NEXTBYTE - flag - advance to next byte if set
 *   BITNUM    - 3 bit field - the bit number to be tested
218*6a54128fSAndroid Build Coastguard Worker * OFFLEN - 2 bit field - number of bytes in the offset
219*6a54128fSAndroid Build Coastguard Worker * if offlen == 0 (non-branching node)
220*6a54128fSAndroid Build Coastguard Worker * RIGHTPATH - 1 bit field - set if the following node is for the
221*6a54128fSAndroid Build Coastguard Worker * right-hand path (tested bit is set)
222*6a54128fSAndroid Build Coastguard Worker * TRIENODE - 1 bit field - set if the following node is an internal
223*6a54128fSAndroid Build Coastguard Worker * node, otherwise it is a leaf node
224*6a54128fSAndroid Build Coastguard Worker * if offlen != 0 (branching node)
225*6a54128fSAndroid Build Coastguard Worker * LEFTNODE - 1 bit field - set if the left-hand node is internal
226*6a54128fSAndroid Build Coastguard Worker * RIGHTNODE - 1 bit field - set if the right-hand node is internal
227*6a54128fSAndroid Build Coastguard Worker *
228*6a54128fSAndroid Build Coastguard Worker * Due to the way utf8 works, there cannot be branching nodes with
229*6a54128fSAndroid Build Coastguard Worker * NEXTBYTE set, and moreover those nodes always have a righthand
230*6a54128fSAndroid Build Coastguard Worker * descendant.
231*6a54128fSAndroid Build Coastguard Worker */
/*
 * Trie nodes are plain bytes; the masks below pick apart the first
 * byte of an internal node.  Bits 0x40 and 0x80 are interpreted
 * differently depending on whether the node carries an offset
 * (OFFLEN != 0: RIGHTNODE/LEFTNODE) or not (OFFLEN == 0:
 * RIGHTPATH/TRIENODE), hence the aliased values.
 */
typedef const unsigned char utf8trie_t;
#define BITNUM		0x07
#define NEXTBYTE	0x08
#define OFFLEN		0x30
#define OFFLEN_SHIFT	4
#define RIGHTPATH	0x40
#define TRIENODE	0x80
#define RIGHTNODE	0x40
#define LEFTNODE	0x80
241*6a54128fSAndroid Build Coastguard Worker
242*6a54128fSAndroid Build Coastguard Worker /*
243*6a54128fSAndroid Build Coastguard Worker * utf8leaf_t
244*6a54128fSAndroid Build Coastguard Worker *
245*6a54128fSAndroid Build Coastguard Worker * The leaves of the trie are embedded in the trie, and so the same
246*6a54128fSAndroid Build Coastguard Worker * underlying datatype: unsigned char.
247*6a54128fSAndroid Build Coastguard Worker *
248*6a54128fSAndroid Build Coastguard Worker * leaf[0]: The unicode version, stored as a generation number that is
249*6a54128fSAndroid Build Coastguard Worker * an index into utf8agetab[]. With this we can filter code
250*6a54128fSAndroid Build Coastguard Worker * points based on the unicode version in which they were
251*6a54128fSAndroid Build Coastguard Worker * defined. The CCC of a non-defined code point is 0.
252*6a54128fSAndroid Build Coastguard Worker * leaf[1]: Canonical Combining Class. During normalization, we need
253*6a54128fSAndroid Build Coastguard Worker * to do a stable sort into ascending order of all characters
254*6a54128fSAndroid Build Coastguard Worker * with a non-zero CCC that occur between two characters with
255*6a54128fSAndroid Build Coastguard Worker * a CCC of 0, or at the begin or end of a string.
256*6a54128fSAndroid Build Coastguard Worker * The unicode standard guarantees that all CCC values are
257*6a54128fSAndroid Build Coastguard Worker * between 0 and 254 inclusive, which leaves 255 available as
258*6a54128fSAndroid Build Coastguard Worker * a special value.
259*6a54128fSAndroid Build Coastguard Worker * Code points with CCC 0 are known as stoppers.
260*6a54128fSAndroid Build Coastguard Worker * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
261*6a54128fSAndroid Build Coastguard Worker * start of a NUL-terminated string that is the decomposition
262*6a54128fSAndroid Build Coastguard Worker * of the character.
263*6a54128fSAndroid Build Coastguard Worker * The CCC of a decomposable character is the same as the CCC
264*6a54128fSAndroid Build Coastguard Worker * of the first character of its decomposition.
265*6a54128fSAndroid Build Coastguard Worker * Some characters decompose as the empty string: these are
266*6a54128fSAndroid Build Coastguard Worker * characters with the Default_Ignorable_Code_Point property.
267*6a54128fSAndroid Build Coastguard Worker * These do affect normalization, as they all have CCC 0.
268*6a54128fSAndroid Build Coastguard Worker *
269*6a54128fSAndroid Build Coastguard Worker * The decompositions in the trie have been fully expanded, with the
270*6a54128fSAndroid Build Coastguard Worker * exception of Hangul syllables, which are decomposed algorithmically.
271*6a54128fSAndroid Build Coastguard Worker *
272*6a54128fSAndroid Build Coastguard Worker * Casefolding, if applicable, is also done using decompositions.
273*6a54128fSAndroid Build Coastguard Worker *
274*6a54128fSAndroid Build Coastguard Worker * The trie is constructed in such a way that leaves exist for all
275*6a54128fSAndroid Build Coastguard Worker * UTF-8 sequences that match the criteria from the "UTF-8 valid
276*6a54128fSAndroid Build Coastguard Worker * ranges" comment above, and only for those sequences. Therefore a
277*6a54128fSAndroid Build Coastguard Worker * lookup in the trie can be used to validate the UTF-8 input.
278*6a54128fSAndroid Build Coastguard Worker */
/*
 * Leaves share the trie's underlying byte type.  The accessors follow
 * the leaf layout: byte 0 is the generation, byte 1 the canonical
 * combining class, and the decomposition string starts at byte 2.
 */
typedef const unsigned char utf8leaf_t;

#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))

/* Range of real CCC values, plus the two special meanings. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)
#define	DECOMPOSE	(255)

/* Marker for hangul syllable decomposition. */
#define HANGUL		((char)(255))
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
#define UTF8HANGULLEAF	(12)
294*6a54128fSAndroid Build Coastguard Worker
295*6a54128fSAndroid Build Coastguard Worker /*
296*6a54128fSAndroid Build Coastguard Worker * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
297*6a54128fSAndroid Build Coastguard Worker *
298*6a54128fSAndroid Build Coastguard Worker * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
299*6a54128fSAndroid Build Coastguard Worker * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
300*6a54128fSAndroid Build Coastguard Worker *
301*6a54128fSAndroid Build Coastguard Worker * SBase = 0xAC00
302*6a54128fSAndroid Build Coastguard Worker * LBase = 0x1100
303*6a54128fSAndroid Build Coastguard Worker * VBase = 0x1161
304*6a54128fSAndroid Build Coastguard Worker * TBase = 0x11A7
305*6a54128fSAndroid Build Coastguard Worker * LCount = 19
306*6a54128fSAndroid Build Coastguard Worker * VCount = 21
307*6a54128fSAndroid Build Coastguard Worker * TCount = 28
308*6a54128fSAndroid Build Coastguard Worker * NCount = 588 (VCount * TCount)
309*6a54128fSAndroid Build Coastguard Worker * SCount = 11172 (LCount * NCount)
310*6a54128fSAndroid Build Coastguard Worker *
311*6a54128fSAndroid Build Coastguard Worker * Decomposition:
312*6a54128fSAndroid Build Coastguard Worker * SIndex = s - SBase
313*6a54128fSAndroid Build Coastguard Worker *
314*6a54128fSAndroid Build Coastguard Worker * LV (Canonical/Full)
315*6a54128fSAndroid Build Coastguard Worker * LIndex = SIndex / NCount
316*6a54128fSAndroid Build Coastguard Worker * VIndex = (Sindex % NCount) / TCount
317*6a54128fSAndroid Build Coastguard Worker * LPart = LBase + LIndex
318*6a54128fSAndroid Build Coastguard Worker * VPart = VBase + VIndex
319*6a54128fSAndroid Build Coastguard Worker *
320*6a54128fSAndroid Build Coastguard Worker * LVT (Canonical)
321*6a54128fSAndroid Build Coastguard Worker * LVIndex = (SIndex / TCount) * TCount
322*6a54128fSAndroid Build Coastguard Worker * TIndex = (Sindex % TCount)
323*6a54128fSAndroid Build Coastguard Worker * LVPart = SBase + LVIndex
324*6a54128fSAndroid Build Coastguard Worker * TPart = TBase + TIndex
325*6a54128fSAndroid Build Coastguard Worker *
326*6a54128fSAndroid Build Coastguard Worker * LVT (Full)
327*6a54128fSAndroid Build Coastguard Worker * LIndex = SIndex / NCount
328*6a54128fSAndroid Build Coastguard Worker * VIndex = (Sindex % NCount) / TCount
329*6a54128fSAndroid Build Coastguard Worker * TIndex = (Sindex % TCount)
330*6a54128fSAndroid Build Coastguard Worker * LPart = LBase + LIndex
331*6a54128fSAndroid Build Coastguard Worker * VPart = VBase + VIndex
 *   if (TIndex == 0) {
 *          d = <LPart, VPart>
 *   } else {
 *          TPart = TBase + TIndex
 *          d = <LPart, VPart, TPart>
 *   }
338*6a54128fSAndroid Build Coastguard Worker */
339*6a54128fSAndroid Build Coastguard Worker
/*
 * Constants for algorithmic Hangul decomposition:
 * syllable/leading/vowel/trailing bases (SB, LB, VB, TB) and the
 * leading/vowel/trailing counts (LC, VC, TC), from which the
 * per-leading count NC and the total syllable count SC are derived.
 */
#define SB	(0xAC00)
#define LB	(0x1100)
#define VB	(0x1161)
#define TB	(0x11A7)
#define LC	(19)
#define VC	(21)
#define TC	(28)
#define NC	(VC * TC)
#define SC	(LC * NC)
350*6a54128fSAndroid Build Coastguard Worker
351*6a54128fSAndroid Build Coastguard Worker /* Algorithmic decomposition of hangul syllable. */
352*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *
utf8hangul(const char * str,unsigned char * hangul)353*6a54128fSAndroid Build Coastguard Worker utf8hangul(const char *str, unsigned char *hangul)
354*6a54128fSAndroid Build Coastguard Worker {
355*6a54128fSAndroid Build Coastguard Worker unsigned int si;
356*6a54128fSAndroid Build Coastguard Worker unsigned int li;
357*6a54128fSAndroid Build Coastguard Worker unsigned int vi;
358*6a54128fSAndroid Build Coastguard Worker unsigned int ti;
359*6a54128fSAndroid Build Coastguard Worker unsigned char *h;
360*6a54128fSAndroid Build Coastguard Worker
361*6a54128fSAndroid Build Coastguard Worker /* Calculate the SI, LI, VI, and TI values. */
362*6a54128fSAndroid Build Coastguard Worker si = utf8decode3(str) - SB;
363*6a54128fSAndroid Build Coastguard Worker li = si / NC;
364*6a54128fSAndroid Build Coastguard Worker vi = (si % NC) / TC;
365*6a54128fSAndroid Build Coastguard Worker ti = si % TC;
366*6a54128fSAndroid Build Coastguard Worker
367*6a54128fSAndroid Build Coastguard Worker /* Fill in base of leaf. */
368*6a54128fSAndroid Build Coastguard Worker h = hangul;
369*6a54128fSAndroid Build Coastguard Worker LEAF_GEN(h) = 2;
370*6a54128fSAndroid Build Coastguard Worker LEAF_CCC(h) = DECOMPOSE;
371*6a54128fSAndroid Build Coastguard Worker h += 2;
372*6a54128fSAndroid Build Coastguard Worker
373*6a54128fSAndroid Build Coastguard Worker /* Add LPart, a 3-byte UTF-8 sequence. */
374*6a54128fSAndroid Build Coastguard Worker h += utf8encode3((char *)h, li + LB);
375*6a54128fSAndroid Build Coastguard Worker
376*6a54128fSAndroid Build Coastguard Worker /* Add VPart, a 3-byte UTF-8 sequence. */
377*6a54128fSAndroid Build Coastguard Worker h += utf8encode3((char *)h, vi + VB);
378*6a54128fSAndroid Build Coastguard Worker
379*6a54128fSAndroid Build Coastguard Worker /* Add TPart if required, also a 3-byte UTF-8 sequence. */
380*6a54128fSAndroid Build Coastguard Worker if (ti)
381*6a54128fSAndroid Build Coastguard Worker h += utf8encode3((char *)h, ti + TB);
382*6a54128fSAndroid Build Coastguard Worker
383*6a54128fSAndroid Build Coastguard Worker /* Terminate string. */
384*6a54128fSAndroid Build Coastguard Worker h[0] = '\0';
385*6a54128fSAndroid Build Coastguard Worker
386*6a54128fSAndroid Build Coastguard Worker return hangul;
387*6a54128fSAndroid Build Coastguard Worker }
388*6a54128fSAndroid Build Coastguard Worker
389*6a54128fSAndroid Build Coastguard Worker /*
390*6a54128fSAndroid Build Coastguard Worker * Use trie to scan s, touching at most len bytes.
391*6a54128fSAndroid Build Coastguard Worker * Returns the leaf if one exists, NULL otherwise.
392*6a54128fSAndroid Build Coastguard Worker *
393*6a54128fSAndroid Build Coastguard Worker * A non-NULL return guarantees that the UTF-8 sequence starting at s
394*6a54128fSAndroid Build Coastguard Worker * is well-formed and corresponds to a known unicode code point. The
395*6a54128fSAndroid Build Coastguard Worker * shorthand for this will be "is valid UTF-8 unicode".
396*6a54128fSAndroid Build Coastguard Worker */
utf8nlookup(const struct utf8data * data,unsigned char * hangul,const char * s,size_t len)397*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8nlookup(const struct utf8data *data,
398*6a54128fSAndroid Build Coastguard Worker unsigned char *hangul, const char *s, size_t len)
399*6a54128fSAndroid Build Coastguard Worker {
400*6a54128fSAndroid Build Coastguard Worker utf8trie_t *trie;
401*6a54128fSAndroid Build Coastguard Worker int offlen;
402*6a54128fSAndroid Build Coastguard Worker int offset;
403*6a54128fSAndroid Build Coastguard Worker int mask;
404*6a54128fSAndroid Build Coastguard Worker int node;
405*6a54128fSAndroid Build Coastguard Worker
406*6a54128fSAndroid Build Coastguard Worker if (!data)
407*6a54128fSAndroid Build Coastguard Worker return NULL;
408*6a54128fSAndroid Build Coastguard Worker if (len == 0)
409*6a54128fSAndroid Build Coastguard Worker return NULL;
410*6a54128fSAndroid Build Coastguard Worker
411*6a54128fSAndroid Build Coastguard Worker trie = utf8data + data->offset;
412*6a54128fSAndroid Build Coastguard Worker node = 1;
413*6a54128fSAndroid Build Coastguard Worker while (node) {
414*6a54128fSAndroid Build Coastguard Worker offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
415*6a54128fSAndroid Build Coastguard Worker if (*trie & NEXTBYTE) {
416*6a54128fSAndroid Build Coastguard Worker if (--len == 0)
417*6a54128fSAndroid Build Coastguard Worker return NULL;
418*6a54128fSAndroid Build Coastguard Worker s++;
419*6a54128fSAndroid Build Coastguard Worker }
420*6a54128fSAndroid Build Coastguard Worker mask = 1 << (*trie & BITNUM);
421*6a54128fSAndroid Build Coastguard Worker if (*s & mask) {
422*6a54128fSAndroid Build Coastguard Worker /* Right leg */
423*6a54128fSAndroid Build Coastguard Worker if (offlen) {
424*6a54128fSAndroid Build Coastguard Worker /* Right node at offset of trie */
425*6a54128fSAndroid Build Coastguard Worker node = (*trie & RIGHTNODE);
426*6a54128fSAndroid Build Coastguard Worker offset = trie[offlen];
427*6a54128fSAndroid Build Coastguard Worker while (--offlen) {
428*6a54128fSAndroid Build Coastguard Worker offset <<= 8;
429*6a54128fSAndroid Build Coastguard Worker offset |= trie[offlen];
430*6a54128fSAndroid Build Coastguard Worker }
431*6a54128fSAndroid Build Coastguard Worker trie += offset;
432*6a54128fSAndroid Build Coastguard Worker } else if (*trie & RIGHTPATH) {
433*6a54128fSAndroid Build Coastguard Worker /* Right node after this node */
434*6a54128fSAndroid Build Coastguard Worker node = (*trie & TRIENODE);
435*6a54128fSAndroid Build Coastguard Worker trie++;
436*6a54128fSAndroid Build Coastguard Worker } else {
437*6a54128fSAndroid Build Coastguard Worker /* No right node. */
438*6a54128fSAndroid Build Coastguard Worker return NULL;
439*6a54128fSAndroid Build Coastguard Worker }
440*6a54128fSAndroid Build Coastguard Worker } else {
441*6a54128fSAndroid Build Coastguard Worker /* Left leg */
442*6a54128fSAndroid Build Coastguard Worker if (offlen) {
443*6a54128fSAndroid Build Coastguard Worker /* Left node after this node. */
444*6a54128fSAndroid Build Coastguard Worker node = (*trie & LEFTNODE);
445*6a54128fSAndroid Build Coastguard Worker trie += offlen + 1;
446*6a54128fSAndroid Build Coastguard Worker } else if (*trie & RIGHTPATH) {
447*6a54128fSAndroid Build Coastguard Worker /* No left node. */
448*6a54128fSAndroid Build Coastguard Worker return NULL;
449*6a54128fSAndroid Build Coastguard Worker } else {
450*6a54128fSAndroid Build Coastguard Worker /* Left node after this node */
451*6a54128fSAndroid Build Coastguard Worker node = (*trie & TRIENODE);
452*6a54128fSAndroid Build Coastguard Worker trie++;
453*6a54128fSAndroid Build Coastguard Worker }
454*6a54128fSAndroid Build Coastguard Worker }
455*6a54128fSAndroid Build Coastguard Worker }
456*6a54128fSAndroid Build Coastguard Worker /*
457*6a54128fSAndroid Build Coastguard Worker * Hangul decomposition is done algorithmically. These are the
458*6a54128fSAndroid Build Coastguard Worker * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
459*6a54128fSAndroid Build Coastguard Worker * always 3 bytes long, so s has been advanced twice, and the
460*6a54128fSAndroid Build Coastguard Worker * start of the sequence is at s-2.
461*6a54128fSAndroid Build Coastguard Worker */
462*6a54128fSAndroid Build Coastguard Worker if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
463*6a54128fSAndroid Build Coastguard Worker trie = utf8hangul(s - 2, hangul);
464*6a54128fSAndroid Build Coastguard Worker return trie;
465*6a54128fSAndroid Build Coastguard Worker }
466*6a54128fSAndroid Build Coastguard Worker
467*6a54128fSAndroid Build Coastguard Worker /*
468*6a54128fSAndroid Build Coastguard Worker * Use trie to scan s.
469*6a54128fSAndroid Build Coastguard Worker * Returns the leaf if one exists, NULL otherwise.
470*6a54128fSAndroid Build Coastguard Worker *
471*6a54128fSAndroid Build Coastguard Worker * Forwards to utf8nlookup().
472*6a54128fSAndroid Build Coastguard Worker */
utf8lookup(const struct utf8data * data,unsigned char * hangul,const char * s)473*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8lookup(const struct utf8data *data,
474*6a54128fSAndroid Build Coastguard Worker unsigned char *hangul, const char *s)
475*6a54128fSAndroid Build Coastguard Worker {
476*6a54128fSAndroid Build Coastguard Worker return utf8nlookup(data, hangul, s, (size_t)-1);
477*6a54128fSAndroid Build Coastguard Worker }
478*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Find the highest age among the characters of s.
 *
 * Ages above data->maxage are not taken into account.
 *
 * Return -1 if data is NULL or s is not valid UTF-8 unicode.
 * Return 0 if only non-assigned code points are used.
 */
static int utf8agemax(const struct utf8data *data, const char *s)
{
	unsigned char hangul[UTF8HANGULLEAF];
	int newest = 0;

	if (!data)
		return -1;

	/* Visit one character per iteration. */
	for (; *s; s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8lookup(data, hangul, s);
		int leaf_age;

		if (!leaf)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age > newest)
			newest = leaf_age;
	}
	return newest;
}
#endif
508*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Find the lowest age among the characters of s.
 *
 * The search starts from data->maxage, and ages above data->maxage
 * are not taken into account.
 *
 * Return -1 if data is NULL or s is not valid UTF-8 unicode.
 * Return 0 if non-assigned code points are used.
 */
static int utf8agemin(const struct utf8data *data, const char *s)
{
	unsigned char hangul[UTF8HANGULLEAF];
	int oldest;

	if (!data)
		return -1;

	oldest = data->maxage;
	/* Visit one character per iteration. */
	for (; *s; s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8lookup(data, hangul, s);
		int leaf_age;

		if (!leaf)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age < oldest)
			oldest = leaf_age;
	}
	return oldest;
}
#endif
537*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Find the highest age among the characters of s, touching at most
 * len bytes and stopping early at a NUL byte.
 *
 * Ages above data->maxage are not taken into account.
 *
 * Return -1 if data is NULL or s is not valid UTF-8 unicode.
 */
static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char hangul[UTF8HANGULLEAF];
	int newest = 0;

	if (!data)
		return -1;

	/* Each step charges the character's byte count against len. */
	for (; len && *s; len -= utf8clen(s), s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8nlookup(data, hangul, s, len);
		int leaf_age;

		if (!leaf)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age > newest)
			newest = leaf_age;
	}
	return newest;
}
#endif
566*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Find the lowest age among the characters of s, touching at most
 * len bytes and stopping early at a NUL byte.
 *
 * The search starts from data->maxage, and ages above data->maxage
 * are not taken into account.
 *
 * Return -1 if data is NULL or s is not valid UTF-8 unicode.
 */
static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char hangul[UTF8HANGULLEAF];
	int oldest;

	if (!data)
		return -1;

	oldest = data->maxage;
	/* Each step charges the character's byte count against len. */
	for (; len && *s; len -= utf8clen(s), s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8nlookup(data, hangul, s, len);
		int leaf_age;

		if (!leaf)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age < oldest)
			oldest = leaf_age;
	}
	return oldest;
}
#endif
595*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Compute the length of the normalization of s.
 *
 * A character contributes the length of its decomposition string when
 * its age does not exceed data->maxage and its leaf is marked
 * DECOMPOSE; otherwise it contributes its own encoded length.
 *
 * Return -1 if data is NULL or s is not valid UTF-8 unicode.
 *
 * A string of Default_Ignorable_Code_Point has length 0.
 */
static ssize_t utf8len(const struct utf8data *data, const char *s)
{
	unsigned char hangul[UTF8HANGULLEAF];
	utf8leaf_t *leaf;
	size_t total = 0;
	int too_new;

	if (!data)
		return -1;

	for (; *s; s += utf8clen(s)) {
		leaf = utf8lookup(data, hangul, s);
		if (!leaf)
			return -1;

		/* Characters newer than maxage are counted as-is. */
		too_new = utf8agetab[LEAF_GEN(leaf)] > data->maxage;
		if (!too_new && LEAF_CCC(leaf) == DECOMPOSE)
			total += strlen(LEAF_STR(leaf));
		else
			total += utf8clen(s);
	}
	return total;
}
#endif
626*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Compute the length of the normalization of s, touching at most len
 * bytes and stopping early at a NUL byte.
 *
 * A character contributes the length of its decomposition string when
 * its age does not exceed data->maxage and its leaf is marked
 * DECOMPOSE; otherwise it contributes its own encoded length.
 *
 * Return -1 if data is NULL or s is not valid UTF-8 unicode.
 */
static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char hangul[UTF8HANGULLEAF];
	utf8leaf_t *leaf;
	size_t total = 0;
	int too_new;

	if (!data)
		return -1;

	/* Each step charges the character's byte count against len. */
	for (; len && *s; len -= utf8clen(s), s += utf8clen(s)) {
		leaf = utf8nlookup(data, hangul, s, len);
		if (!leaf)
			return -1;

		/* Characters newer than maxage are counted as-is. */
		too_new = utf8agetab[LEAF_GEN(leaf)] > data->maxage;
		if (!too_new && LEAF_CCC(leaf) == DECOMPOSE)
			total += strlen(LEAF_STR(leaf));
		else
			total += utf8clen(s);
	}
	return total;
}
#endif
656*6a54128fSAndroid Build Coastguard Worker
657*6a54128fSAndroid Build Coastguard Worker /*
658*6a54128fSAndroid Build Coastguard Worker * Set up an utf8cursor for use by utf8byte().
659*6a54128fSAndroid Build Coastguard Worker *
660*6a54128fSAndroid Build Coastguard Worker * u8c : pointer to cursor.
661*6a54128fSAndroid Build Coastguard Worker * data : const struct utf8data to use for normalization.
662*6a54128fSAndroid Build Coastguard Worker * s : string.
663*6a54128fSAndroid Build Coastguard Worker * len : length of s.
664*6a54128fSAndroid Build Coastguard Worker *
665*6a54128fSAndroid Build Coastguard Worker * Returns -1 on error, 0 on success.
666*6a54128fSAndroid Build Coastguard Worker */
utf8ncursor(struct utf8cursor * u8c,const struct utf8data * data,const char * s,size_t len)667*6a54128fSAndroid Build Coastguard Worker static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
668*6a54128fSAndroid Build Coastguard Worker const char *s, size_t len)
669*6a54128fSAndroid Build Coastguard Worker {
670*6a54128fSAndroid Build Coastguard Worker if (!data)
671*6a54128fSAndroid Build Coastguard Worker return -1;
672*6a54128fSAndroid Build Coastguard Worker if (!s)
673*6a54128fSAndroid Build Coastguard Worker return -1;
674*6a54128fSAndroid Build Coastguard Worker u8c->data = data;
675*6a54128fSAndroid Build Coastguard Worker u8c->s = s;
676*6a54128fSAndroid Build Coastguard Worker u8c->p = NULL;
677*6a54128fSAndroid Build Coastguard Worker u8c->ss = NULL;
678*6a54128fSAndroid Build Coastguard Worker u8c->sp = NULL;
679*6a54128fSAndroid Build Coastguard Worker u8c->len = len;
680*6a54128fSAndroid Build Coastguard Worker u8c->slen = 0;
681*6a54128fSAndroid Build Coastguard Worker u8c->ccc = STOPPER;
682*6a54128fSAndroid Build Coastguard Worker u8c->nccc = STOPPER;
683*6a54128fSAndroid Build Coastguard Worker /* Check we didn't clobber the maximum length. */
684*6a54128fSAndroid Build Coastguard Worker if (u8c->len != len)
685*6a54128fSAndroid Build Coastguard Worker return -1;
686*6a54128fSAndroid Build Coastguard Worker /* The first byte of s may not be an utf8 continuation. */
687*6a54128fSAndroid Build Coastguard Worker if (len > 0 && (*s & 0xC0) == 0x80)
688*6a54128fSAndroid Build Coastguard Worker return -1;
689*6a54128fSAndroid Build Coastguard Worker return 0;
690*6a54128fSAndroid Build Coastguard Worker }
691*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Prepare an utf8cursor so that utf8byte() can walk a NUL-terminated
 * string.
 *
 * u8c  : cursor to initialize.
 * data : const struct utf8data to use for normalization.
 * s    : NUL-terminated string.
 *
 * This is utf8ncursor() called with the largest unsigned int value as
 * the length.
 *
 * Returns -1 on error, 0 on success.
 */
static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
		      const char *s)
{
	const unsigned int unbounded = UINT_MAX;

	return utf8ncursor(u8c, data, s, unbounded);
}
#endif
708*6a54128fSAndroid Build Coastguard Worker
709*6a54128fSAndroid Build Coastguard Worker /*
710*6a54128fSAndroid Build Coastguard Worker * Get one byte from the normalized form of the string described by u8c.
711*6a54128fSAndroid Build Coastguard Worker *
712*6a54128fSAndroid Build Coastguard Worker * Returns the byte cast to an unsigned char on success, and -1 on failure.
713*6a54128fSAndroid Build Coastguard Worker *
714*6a54128fSAndroid Build Coastguard Worker * The cursor keeps track of the location in the string in u8c->s.
715*6a54128fSAndroid Build Coastguard Worker * When a character is decomposed, the current location is stored in
716*6a54128fSAndroid Build Coastguard Worker * u8c->p, and u8c->s is set to the start of the decomposition. Note
717*6a54128fSAndroid Build Coastguard Worker * that bytes from a decomposition do not count against u8c->len.
718*6a54128fSAndroid Build Coastguard Worker *
719*6a54128fSAndroid Build Coastguard Worker * Characters are emitted if they match the current CCC in u8c->ccc.
720*6a54128fSAndroid Build Coastguard Worker * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
721*6a54128fSAndroid Build Coastguard Worker * and the function returns 0 in that case.
722*6a54128fSAndroid Build Coastguard Worker *
723*6a54128fSAndroid Build Coastguard Worker * Sorting by CCC is done by repeatedly scanning the string. The
724*6a54128fSAndroid Build Coastguard Worker * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
725*6a54128fSAndroid Build Coastguard Worker * the start of the scan. The first pass finds the lowest CCC to be
726*6a54128fSAndroid Build Coastguard Worker * emitted and stores it in u8c->nccc, the second pass emits the
727*6a54128fSAndroid Build Coastguard Worker * characters with this CCC and finds the next lowest CCC. This limits
728*6a54128fSAndroid Build Coastguard Worker * the number of passes to 1 + the number of different CCCs in the
729*6a54128fSAndroid Build Coastguard Worker * sequence being scanned.
730*6a54128fSAndroid Build Coastguard Worker *
731*6a54128fSAndroid Build Coastguard Worker * Therefore:
732*6a54128fSAndroid Build Coastguard Worker * u8c->p != NULL -> a decomposition is being scanned.
733*6a54128fSAndroid Build Coastguard Worker * u8c->ss != NULL -> this is a repeating scan.
734*6a54128fSAndroid Build Coastguard Worker * u8c->ccc == -1 -> this is the first scan of a repeating scan.
735*6a54128fSAndroid Build Coastguard Worker */
utf8byte(struct utf8cursor * u8c)736*6a54128fSAndroid Build Coastguard Worker static int utf8byte(struct utf8cursor *u8c)
737*6a54128fSAndroid Build Coastguard Worker {
738*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
739*6a54128fSAndroid Build Coastguard Worker int ccc;
740*6a54128fSAndroid Build Coastguard Worker
741*6a54128fSAndroid Build Coastguard Worker for (;;) {
742*6a54128fSAndroid Build Coastguard Worker /* Check for the end of a decomposed character. */
743*6a54128fSAndroid Build Coastguard Worker if (u8c->p && *u8c->s == '\0') {
744*6a54128fSAndroid Build Coastguard Worker u8c->s = u8c->p;
745*6a54128fSAndroid Build Coastguard Worker u8c->p = NULL;
746*6a54128fSAndroid Build Coastguard Worker }
747*6a54128fSAndroid Build Coastguard Worker
748*6a54128fSAndroid Build Coastguard Worker /* Check for end-of-string. */
749*6a54128fSAndroid Build Coastguard Worker if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
750*6a54128fSAndroid Build Coastguard Worker /* There is no next byte. */
751*6a54128fSAndroid Build Coastguard Worker if (u8c->ccc == STOPPER)
752*6a54128fSAndroid Build Coastguard Worker return 0;
753*6a54128fSAndroid Build Coastguard Worker /* End-of-string during a scan counts as a stopper. */
754*6a54128fSAndroid Build Coastguard Worker ccc = STOPPER;
755*6a54128fSAndroid Build Coastguard Worker goto ccc_mismatch;
756*6a54128fSAndroid Build Coastguard Worker } else if ((*u8c->s & 0xC0) == 0x80) {
757*6a54128fSAndroid Build Coastguard Worker /* This is a continuation of the current character. */
758*6a54128fSAndroid Build Coastguard Worker if (!u8c->p)
759*6a54128fSAndroid Build Coastguard Worker u8c->len--;
760*6a54128fSAndroid Build Coastguard Worker return (unsigned char)*u8c->s++;
761*6a54128fSAndroid Build Coastguard Worker }
762*6a54128fSAndroid Build Coastguard Worker
763*6a54128fSAndroid Build Coastguard Worker /* Look up the data for the current character. */
764*6a54128fSAndroid Build Coastguard Worker if (u8c->p) {
765*6a54128fSAndroid Build Coastguard Worker leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
766*6a54128fSAndroid Build Coastguard Worker } else {
767*6a54128fSAndroid Build Coastguard Worker leaf = utf8nlookup(u8c->data, u8c->hangul,
768*6a54128fSAndroid Build Coastguard Worker u8c->s, u8c->len);
769*6a54128fSAndroid Build Coastguard Worker }
770*6a54128fSAndroid Build Coastguard Worker
771*6a54128fSAndroid Build Coastguard Worker /* No leaf found implies that the input is a binary blob. */
772*6a54128fSAndroid Build Coastguard Worker if (!leaf)
773*6a54128fSAndroid Build Coastguard Worker return -1;
774*6a54128fSAndroid Build Coastguard Worker
775*6a54128fSAndroid Build Coastguard Worker ccc = LEAF_CCC(leaf);
776*6a54128fSAndroid Build Coastguard Worker /* Characters that are too new have CCC 0. */
777*6a54128fSAndroid Build Coastguard Worker if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
778*6a54128fSAndroid Build Coastguard Worker ccc = STOPPER;
779*6a54128fSAndroid Build Coastguard Worker } else if (ccc == DECOMPOSE) {
780*6a54128fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
781*6a54128fSAndroid Build Coastguard Worker u8c->p = u8c->s + utf8clen(u8c->s);
782*6a54128fSAndroid Build Coastguard Worker u8c->s = LEAF_STR(leaf);
783*6a54128fSAndroid Build Coastguard Worker /* Empty decomposition implies CCC 0. */
784*6a54128fSAndroid Build Coastguard Worker if (*u8c->s == '\0') {
785*6a54128fSAndroid Build Coastguard Worker if (u8c->ccc == STOPPER)
786*6a54128fSAndroid Build Coastguard Worker continue;
787*6a54128fSAndroid Build Coastguard Worker ccc = STOPPER;
788*6a54128fSAndroid Build Coastguard Worker goto ccc_mismatch;
789*6a54128fSAndroid Build Coastguard Worker }
790*6a54128fSAndroid Build Coastguard Worker
791*6a54128fSAndroid Build Coastguard Worker leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
792*6a54128fSAndroid Build Coastguard Worker if (!leaf)
793*6a54128fSAndroid Build Coastguard Worker return -1;
794*6a54128fSAndroid Build Coastguard Worker ccc = LEAF_CCC(leaf);
795*6a54128fSAndroid Build Coastguard Worker }
796*6a54128fSAndroid Build Coastguard Worker
797*6a54128fSAndroid Build Coastguard Worker /*
798*6a54128fSAndroid Build Coastguard Worker * If this is not a stopper, then see if it updates
799*6a54128fSAndroid Build Coastguard Worker * the next canonical class to be emitted.
800*6a54128fSAndroid Build Coastguard Worker */
801*6a54128fSAndroid Build Coastguard Worker if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
802*6a54128fSAndroid Build Coastguard Worker u8c->nccc = ccc;
803*6a54128fSAndroid Build Coastguard Worker
804*6a54128fSAndroid Build Coastguard Worker /*
805*6a54128fSAndroid Build Coastguard Worker * Return the current byte if this is the current
806*6a54128fSAndroid Build Coastguard Worker * combining class.
807*6a54128fSAndroid Build Coastguard Worker */
808*6a54128fSAndroid Build Coastguard Worker if (ccc == u8c->ccc) {
809*6a54128fSAndroid Build Coastguard Worker if (!u8c->p)
810*6a54128fSAndroid Build Coastguard Worker u8c->len--;
811*6a54128fSAndroid Build Coastguard Worker return (unsigned char)*u8c->s++;
812*6a54128fSAndroid Build Coastguard Worker }
813*6a54128fSAndroid Build Coastguard Worker
814*6a54128fSAndroid Build Coastguard Worker /* Current combining class mismatch. */
815*6a54128fSAndroid Build Coastguard Worker ccc_mismatch:
816*6a54128fSAndroid Build Coastguard Worker if (u8c->nccc == STOPPER) {
817*6a54128fSAndroid Build Coastguard Worker /*
818*6a54128fSAndroid Build Coastguard Worker * Scan forward for the first canonical class
819*6a54128fSAndroid Build Coastguard Worker * to be emitted. Save the position from
820*6a54128fSAndroid Build Coastguard Worker * which to restart.
821*6a54128fSAndroid Build Coastguard Worker */
822*6a54128fSAndroid Build Coastguard Worker u8c->ccc = MINCCC - 1;
823*6a54128fSAndroid Build Coastguard Worker u8c->nccc = ccc;
824*6a54128fSAndroid Build Coastguard Worker u8c->sp = u8c->p;
825*6a54128fSAndroid Build Coastguard Worker u8c->ss = u8c->s;
826*6a54128fSAndroid Build Coastguard Worker u8c->slen = u8c->len;
827*6a54128fSAndroid Build Coastguard Worker if (!u8c->p)
828*6a54128fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
829*6a54128fSAndroid Build Coastguard Worker u8c->s += utf8clen(u8c->s);
830*6a54128fSAndroid Build Coastguard Worker } else if (ccc != STOPPER) {
831*6a54128fSAndroid Build Coastguard Worker /* Not a stopper, and not the ccc we're emitting. */
832*6a54128fSAndroid Build Coastguard Worker if (!u8c->p)
833*6a54128fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
834*6a54128fSAndroid Build Coastguard Worker u8c->s += utf8clen(u8c->s);
835*6a54128fSAndroid Build Coastguard Worker } else if (u8c->nccc != MAXCCC + 1) {
836*6a54128fSAndroid Build Coastguard Worker /* At a stopper, restart for next ccc. */
837*6a54128fSAndroid Build Coastguard Worker u8c->ccc = u8c->nccc;
838*6a54128fSAndroid Build Coastguard Worker u8c->nccc = MAXCCC + 1;
839*6a54128fSAndroid Build Coastguard Worker u8c->s = u8c->ss;
840*6a54128fSAndroid Build Coastguard Worker u8c->p = u8c->sp;
841*6a54128fSAndroid Build Coastguard Worker u8c->len = u8c->slen;
842*6a54128fSAndroid Build Coastguard Worker } else {
843*6a54128fSAndroid Build Coastguard Worker /* All done, proceed from here. */
844*6a54128fSAndroid Build Coastguard Worker u8c->ccc = STOPPER;
845*6a54128fSAndroid Build Coastguard Worker u8c->nccc = STOPPER;
846*6a54128fSAndroid Build Coastguard Worker u8c->sp = NULL;
847*6a54128fSAndroid Build Coastguard Worker u8c->ss = NULL;
848*6a54128fSAndroid Build Coastguard Worker u8c->slen = 0;
849*6a54128fSAndroid Build Coastguard Worker }
850*6a54128fSAndroid Build Coastguard Worker }
851*6a54128fSAndroid Build Coastguard Worker }
852*6a54128fSAndroid Build Coastguard Worker
#if 0
/*
 * Look for the const struct utf8data that belongs to a unicode version.
 *
 * The table is searched from its last entry backwards for the first
 * entry whose maxage does not exceed the requested one. That entry is
 * returned only if its maxage equals the request; otherwise, or when
 * no such entry exists, NULL is returned.
 *
 * Two normalization forms are supported: nfdi and nfdicf.
 *
 * nfdi:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *
 * nfdicf:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *  - Apply a full casefold (C + F).
 */
static const struct utf8data *utf8nfdi(unsigned int maxage)
{
	int i = ARRAY_SIZE(utf8nfdidata) - 1;

	/* Never step below the first entry of the table. */
	while (i >= 0 && maxage < utf8nfdidata[i].maxage)
		i--;
	if (i < 0 || maxage > utf8nfdidata[i].maxage)
		return NULL;
	return &utf8nfdidata[i];
}
#endif
880*6a54128fSAndroid Build Coastguard Worker
utf8nfdicf(unsigned int maxage)881*6a54128fSAndroid Build Coastguard Worker static const struct utf8data *utf8nfdicf(unsigned int maxage)
882*6a54128fSAndroid Build Coastguard Worker {
883*6a54128fSAndroid Build Coastguard Worker int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
884*6a54128fSAndroid Build Coastguard Worker
885*6a54128fSAndroid Build Coastguard Worker while (maxage < utf8nfdicfdata[i].maxage)
886*6a54128fSAndroid Build Coastguard Worker i--;
887*6a54128fSAndroid Build Coastguard Worker if (maxage > utf8nfdicfdata[i].maxage)
888*6a54128fSAndroid Build Coastguard Worker return NULL;
889*6a54128fSAndroid Build Coastguard Worker return &utf8nfdicfdata[i];
890*6a54128fSAndroid Build Coastguard Worker }
891*6a54128fSAndroid Build Coastguard Worker
utf8_casefold(const struct ext2fs_nls_table * table,const unsigned char * str,size_t len,unsigned char * dest,size_t dlen)892*6a54128fSAndroid Build Coastguard Worker static int utf8_casefold(const struct ext2fs_nls_table *table,
893*6a54128fSAndroid Build Coastguard Worker const unsigned char *str, size_t len,
894*6a54128fSAndroid Build Coastguard Worker unsigned char *dest, size_t dlen)
895*6a54128fSAndroid Build Coastguard Worker {
896*6a54128fSAndroid Build Coastguard Worker const struct utf8data *data = utf8nfdicf(table->version);
897*6a54128fSAndroid Build Coastguard Worker struct utf8cursor cur;
898*6a54128fSAndroid Build Coastguard Worker size_t nlen = 0;
899*6a54128fSAndroid Build Coastguard Worker
900*6a54128fSAndroid Build Coastguard Worker if (utf8ncursor(&cur, data, (const char *) str, len) < 0)
901*6a54128fSAndroid Build Coastguard Worker goto invalid_seq;
902*6a54128fSAndroid Build Coastguard Worker
903*6a54128fSAndroid Build Coastguard Worker for (nlen = 0; nlen < dlen; nlen++) {
904*6a54128fSAndroid Build Coastguard Worker int c = utf8byte(&cur);
905*6a54128fSAndroid Build Coastguard Worker
906*6a54128fSAndroid Build Coastguard Worker dest[nlen] = c;
907*6a54128fSAndroid Build Coastguard Worker if (!c)
908*6a54128fSAndroid Build Coastguard Worker return nlen;
909*6a54128fSAndroid Build Coastguard Worker if (c == -1)
910*6a54128fSAndroid Build Coastguard Worker break;
911*6a54128fSAndroid Build Coastguard Worker }
912*6a54128fSAndroid Build Coastguard Worker
913*6a54128fSAndroid Build Coastguard Worker return -ENAMETOOLONG;
914*6a54128fSAndroid Build Coastguard Worker
915*6a54128fSAndroid Build Coastguard Worker invalid_seq:
916*6a54128fSAndroid Build Coastguard Worker if (dlen < len)
917*6a54128fSAndroid Build Coastguard Worker return -ENAMETOOLONG;
918*6a54128fSAndroid Build Coastguard Worker
919*6a54128fSAndroid Build Coastguard Worker /* Signal invalid sequence */
920*6a54128fSAndroid Build Coastguard Worker return -EINVAL;
921*6a54128fSAndroid Build Coastguard Worker }
922*6a54128fSAndroid Build Coastguard Worker
utf8_validate(const struct ext2fs_nls_table * table,char * s,size_t len,char ** pos)923*6a54128fSAndroid Build Coastguard Worker static int utf8_validate(const struct ext2fs_nls_table *table,
924*6a54128fSAndroid Build Coastguard Worker char *s, size_t len, char **pos)
925*6a54128fSAndroid Build Coastguard Worker {
926*6a54128fSAndroid Build Coastguard Worker const struct utf8data *data = utf8nfdicf(table->version);
927*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
928*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
929*6a54128fSAndroid Build Coastguard Worker
930*6a54128fSAndroid Build Coastguard Worker if (!data)
931*6a54128fSAndroid Build Coastguard Worker return -1;
932*6a54128fSAndroid Build Coastguard Worker while (len && *s) {
933*6a54128fSAndroid Build Coastguard Worker leaf = utf8nlookup(data, hangul, s, len);
934*6a54128fSAndroid Build Coastguard Worker if (!leaf) {
935*6a54128fSAndroid Build Coastguard Worker *pos = s;
936*6a54128fSAndroid Build Coastguard Worker return 1;
937*6a54128fSAndroid Build Coastguard Worker }
938*6a54128fSAndroid Build Coastguard Worker len -= utf8clen(s);
939*6a54128fSAndroid Build Coastguard Worker s += utf8clen(s);
940*6a54128fSAndroid Build Coastguard Worker }
941*6a54128fSAndroid Build Coastguard Worker return 0;
942*6a54128fSAndroid Build Coastguard Worker }
943*6a54128fSAndroid Build Coastguard Worker
utf8_casefold_cmp(const struct ext2fs_nls_table * table,const unsigned char * str1,size_t len1,const unsigned char * str2,size_t len2)944*6a54128fSAndroid Build Coastguard Worker static int utf8_casefold_cmp(const struct ext2fs_nls_table *table,
945*6a54128fSAndroid Build Coastguard Worker const unsigned char *str1, size_t len1,
946*6a54128fSAndroid Build Coastguard Worker const unsigned char *str2, size_t len2)
947*6a54128fSAndroid Build Coastguard Worker {
948*6a54128fSAndroid Build Coastguard Worker const struct utf8data *data = utf8nfdicf(table->version);
949*6a54128fSAndroid Build Coastguard Worker int c1, c2;
950*6a54128fSAndroid Build Coastguard Worker struct utf8cursor cur1, cur2;
951*6a54128fSAndroid Build Coastguard Worker
952*6a54128fSAndroid Build Coastguard Worker if (utf8ncursor(&cur1, data, (const char *) str1, len1) < 0)
953*6a54128fSAndroid Build Coastguard Worker return -1;
954*6a54128fSAndroid Build Coastguard Worker if (utf8ncursor(&cur2, data, (const char *) str2, len2) < 0)
955*6a54128fSAndroid Build Coastguard Worker return -1;
956*6a54128fSAndroid Build Coastguard Worker
957*6a54128fSAndroid Build Coastguard Worker do {
958*6a54128fSAndroid Build Coastguard Worker c1 = utf8byte(&cur1);
959*6a54128fSAndroid Build Coastguard Worker c2 = utf8byte(&cur2);
960*6a54128fSAndroid Build Coastguard Worker
961*6a54128fSAndroid Build Coastguard Worker if (c1 < 0 || c2 < 0)
962*6a54128fSAndroid Build Coastguard Worker return -1;
963*6a54128fSAndroid Build Coastguard Worker if (c1 != c2)
964*6a54128fSAndroid Build Coastguard Worker return c1 - c2;
965*6a54128fSAndroid Build Coastguard Worker } while (c1);
966*6a54128fSAndroid Build Coastguard Worker
967*6a54128fSAndroid Build Coastguard Worker return 0;
968*6a54128fSAndroid Build Coastguard Worker }
969*6a54128fSAndroid Build Coastguard Worker
970*6a54128fSAndroid Build Coastguard Worker static const struct ext2fs_nls_ops utf8_ops = {
971*6a54128fSAndroid Build Coastguard Worker .casefold = utf8_casefold,
972*6a54128fSAndroid Build Coastguard Worker .validate = utf8_validate,
973*6a54128fSAndroid Build Coastguard Worker .casefold_cmp = utf8_casefold_cmp,
974*6a54128fSAndroid Build Coastguard Worker };
975*6a54128fSAndroid Build Coastguard Worker
976*6a54128fSAndroid Build Coastguard Worker static const struct ext2fs_nls_table nls_utf8 = {
977*6a54128fSAndroid Build Coastguard Worker .ops = &utf8_ops,
978*6a54128fSAndroid Build Coastguard Worker .version = UNICODE_AGE(12, 1, 0),
979*6a54128fSAndroid Build Coastguard Worker };
980*6a54128fSAndroid Build Coastguard Worker
ext2fs_load_nls_table(int encoding)981*6a54128fSAndroid Build Coastguard Worker const struct ext2fs_nls_table *ext2fs_load_nls_table(int encoding)
982*6a54128fSAndroid Build Coastguard Worker {
983*6a54128fSAndroid Build Coastguard Worker if (encoding == EXT4_ENC_UTF8_12_1)
984*6a54128fSAndroid Build Coastguard Worker return &nls_utf8;
985*6a54128fSAndroid Build Coastguard Worker
986*6a54128fSAndroid Build Coastguard Worker return NULL;
987*6a54128fSAndroid Build Coastguard Worker }
988*6a54128fSAndroid Build Coastguard Worker
ext2fs_check_encoded_name(const struct ext2fs_nls_table * table,char * name,size_t len,char ** pos)989*6a54128fSAndroid Build Coastguard Worker int ext2fs_check_encoded_name(const struct ext2fs_nls_table *table,
990*6a54128fSAndroid Build Coastguard Worker char *name, size_t len, char **pos)
991*6a54128fSAndroid Build Coastguard Worker {
992*6a54128fSAndroid Build Coastguard Worker return table->ops->validate(table, name, len, pos);
993*6a54128fSAndroid Build Coastguard Worker }
994*6a54128fSAndroid Build Coastguard Worker
ext2fs_casefold_cmp(const struct ext2fs_nls_table * table,const unsigned char * str1,size_t len1,const unsigned char * str2,size_t len2)995*6a54128fSAndroid Build Coastguard Worker int ext2fs_casefold_cmp(const struct ext2fs_nls_table *table,
996*6a54128fSAndroid Build Coastguard Worker const unsigned char *str1, size_t len1,
997*6a54128fSAndroid Build Coastguard Worker const unsigned char *str2, size_t len2)
998*6a54128fSAndroid Build Coastguard Worker {
999*6a54128fSAndroid Build Coastguard Worker return table->ops->casefold_cmp(table, str1, len1, str2, len2);
1000*6a54128fSAndroid Build Coastguard Worker }
1001