xref: /aosp_15_r20/external/regex-re2/util/rune.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi /*
2*ccdc9c3eSSadaf Ebrahimi  * The authors of this software are Rob Pike and Ken Thompson.
3*ccdc9c3eSSadaf Ebrahimi  *              Copyright (c) 2002 by Lucent Technologies.
4*ccdc9c3eSSadaf Ebrahimi  * Permission to use, copy, modify, and distribute this software for any
5*ccdc9c3eSSadaf Ebrahimi  * purpose without fee is hereby granted, provided that this entire notice
6*ccdc9c3eSSadaf Ebrahimi  * is included in all copies of any software which is or includes a copy
7*ccdc9c3eSSadaf Ebrahimi  * or modification of this software and in all copies of the supporting
8*ccdc9c3eSSadaf Ebrahimi  * documentation for such software.
9*ccdc9c3eSSadaf Ebrahimi  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10*ccdc9c3eSSadaf Ebrahimi  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11*ccdc9c3eSSadaf Ebrahimi  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12*ccdc9c3eSSadaf Ebrahimi  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13*ccdc9c3eSSadaf Ebrahimi  */
14*ccdc9c3eSSadaf Ebrahimi 
15*ccdc9c3eSSadaf Ebrahimi #include <stdarg.h>
16*ccdc9c3eSSadaf Ebrahimi #include <string.h>
17*ccdc9c3eSSadaf Ebrahimi 
18*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
19*ccdc9c3eSSadaf Ebrahimi 
20*ccdc9c3eSSadaf Ebrahimi namespace re2 {
21*ccdc9c3eSSadaf Ebrahimi 
22*ccdc9c3eSSadaf Ebrahimi enum
23*ccdc9c3eSSadaf Ebrahimi {
24*ccdc9c3eSSadaf Ebrahimi 	Bit1	= 7,
25*ccdc9c3eSSadaf Ebrahimi 	Bitx	= 6,
26*ccdc9c3eSSadaf Ebrahimi 	Bit2	= 5,
27*ccdc9c3eSSadaf Ebrahimi 	Bit3	= 4,
28*ccdc9c3eSSadaf Ebrahimi 	Bit4	= 3,
29*ccdc9c3eSSadaf Ebrahimi 	Bit5	= 2,
30*ccdc9c3eSSadaf Ebrahimi 
31*ccdc9c3eSSadaf Ebrahimi 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
32*ccdc9c3eSSadaf Ebrahimi 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
33*ccdc9c3eSSadaf Ebrahimi 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
34*ccdc9c3eSSadaf Ebrahimi 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
35*ccdc9c3eSSadaf Ebrahimi 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
36*ccdc9c3eSSadaf Ebrahimi 	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
37*ccdc9c3eSSadaf Ebrahimi 
38*ccdc9c3eSSadaf Ebrahimi 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
39*ccdc9c3eSSadaf Ebrahimi 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
40*ccdc9c3eSSadaf Ebrahimi 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
41*ccdc9c3eSSadaf Ebrahimi 	Rune4	= (1<<(Bit4+3*Bitx))-1,
42*ccdc9c3eSSadaf Ebrahimi                                         /* 0001 1111 1111 1111 1111 1111 */
43*ccdc9c3eSSadaf Ebrahimi 
44*ccdc9c3eSSadaf Ebrahimi 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
45*ccdc9c3eSSadaf Ebrahimi 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
46*ccdc9c3eSSadaf Ebrahimi 
47*ccdc9c3eSSadaf Ebrahimi 	Bad	= Runeerror,
48*ccdc9c3eSSadaf Ebrahimi };
49*ccdc9c3eSSadaf Ebrahimi 
50*ccdc9c3eSSadaf Ebrahimi int
chartorune(Rune * rune,const char * str)51*ccdc9c3eSSadaf Ebrahimi chartorune(Rune *rune, const char *str)
52*ccdc9c3eSSadaf Ebrahimi {
53*ccdc9c3eSSadaf Ebrahimi 	int c, c1, c2, c3;
54*ccdc9c3eSSadaf Ebrahimi 	long l;
55*ccdc9c3eSSadaf Ebrahimi 
56*ccdc9c3eSSadaf Ebrahimi 	/*
57*ccdc9c3eSSadaf Ebrahimi 	 * one character sequence
58*ccdc9c3eSSadaf Ebrahimi 	 *	00000-0007F => T1
59*ccdc9c3eSSadaf Ebrahimi 	 */
60*ccdc9c3eSSadaf Ebrahimi 	c = *(unsigned char*)str;
61*ccdc9c3eSSadaf Ebrahimi 	if(c < Tx) {
62*ccdc9c3eSSadaf Ebrahimi 		*rune = c;
63*ccdc9c3eSSadaf Ebrahimi 		return 1;
64*ccdc9c3eSSadaf Ebrahimi 	}
65*ccdc9c3eSSadaf Ebrahimi 
66*ccdc9c3eSSadaf Ebrahimi 	/*
67*ccdc9c3eSSadaf Ebrahimi 	 * two character sequence
68*ccdc9c3eSSadaf Ebrahimi 	 *	0080-07FF => T2 Tx
69*ccdc9c3eSSadaf Ebrahimi 	 */
70*ccdc9c3eSSadaf Ebrahimi 	c1 = *(unsigned char*)(str+1) ^ Tx;
71*ccdc9c3eSSadaf Ebrahimi 	if(c1 & Testx)
72*ccdc9c3eSSadaf Ebrahimi 		goto bad;
73*ccdc9c3eSSadaf Ebrahimi 	if(c < T3) {
74*ccdc9c3eSSadaf Ebrahimi 		if(c < T2)
75*ccdc9c3eSSadaf Ebrahimi 			goto bad;
76*ccdc9c3eSSadaf Ebrahimi 		l = ((c << Bitx) | c1) & Rune2;
77*ccdc9c3eSSadaf Ebrahimi 		if(l <= Rune1)
78*ccdc9c3eSSadaf Ebrahimi 			goto bad;
79*ccdc9c3eSSadaf Ebrahimi 		*rune = l;
80*ccdc9c3eSSadaf Ebrahimi 		return 2;
81*ccdc9c3eSSadaf Ebrahimi 	}
82*ccdc9c3eSSadaf Ebrahimi 
83*ccdc9c3eSSadaf Ebrahimi 	/*
84*ccdc9c3eSSadaf Ebrahimi 	 * three character sequence
85*ccdc9c3eSSadaf Ebrahimi 	 *	0800-FFFF => T3 Tx Tx
86*ccdc9c3eSSadaf Ebrahimi 	 */
87*ccdc9c3eSSadaf Ebrahimi 	c2 = *(unsigned char*)(str+2) ^ Tx;
88*ccdc9c3eSSadaf Ebrahimi 	if(c2 & Testx)
89*ccdc9c3eSSadaf Ebrahimi 		goto bad;
90*ccdc9c3eSSadaf Ebrahimi 	if(c < T4) {
91*ccdc9c3eSSadaf Ebrahimi 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
92*ccdc9c3eSSadaf Ebrahimi 		if(l <= Rune2)
93*ccdc9c3eSSadaf Ebrahimi 			goto bad;
94*ccdc9c3eSSadaf Ebrahimi 		*rune = l;
95*ccdc9c3eSSadaf Ebrahimi 		return 3;
96*ccdc9c3eSSadaf Ebrahimi 	}
97*ccdc9c3eSSadaf Ebrahimi 
98*ccdc9c3eSSadaf Ebrahimi 	/*
99*ccdc9c3eSSadaf Ebrahimi 	 * four character sequence (21-bit value)
100*ccdc9c3eSSadaf Ebrahimi 	 *	10000-1FFFFF => T4 Tx Tx Tx
101*ccdc9c3eSSadaf Ebrahimi 	 */
102*ccdc9c3eSSadaf Ebrahimi 	c3 = *(unsigned char*)(str+3) ^ Tx;
103*ccdc9c3eSSadaf Ebrahimi 	if (c3 & Testx)
104*ccdc9c3eSSadaf Ebrahimi 		goto bad;
105*ccdc9c3eSSadaf Ebrahimi 	if (c < T5) {
106*ccdc9c3eSSadaf Ebrahimi 		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
107*ccdc9c3eSSadaf Ebrahimi 		if (l <= Rune3)
108*ccdc9c3eSSadaf Ebrahimi 			goto bad;
109*ccdc9c3eSSadaf Ebrahimi 		*rune = l;
110*ccdc9c3eSSadaf Ebrahimi 		return 4;
111*ccdc9c3eSSadaf Ebrahimi 	}
112*ccdc9c3eSSadaf Ebrahimi 
113*ccdc9c3eSSadaf Ebrahimi 	/*
114*ccdc9c3eSSadaf Ebrahimi 	 * Support for 5-byte or longer UTF-8 would go here, but
115*ccdc9c3eSSadaf Ebrahimi 	 * since we don't have that, we'll just fall through to bad.
116*ccdc9c3eSSadaf Ebrahimi 	 */
117*ccdc9c3eSSadaf Ebrahimi 
118*ccdc9c3eSSadaf Ebrahimi 	/*
119*ccdc9c3eSSadaf Ebrahimi 	 * bad decoding
120*ccdc9c3eSSadaf Ebrahimi 	 */
121*ccdc9c3eSSadaf Ebrahimi bad:
122*ccdc9c3eSSadaf Ebrahimi 	*rune = Bad;
123*ccdc9c3eSSadaf Ebrahimi 	return 1;
124*ccdc9c3eSSadaf Ebrahimi }
125*ccdc9c3eSSadaf Ebrahimi 
126*ccdc9c3eSSadaf Ebrahimi int
runetochar(char * str,const Rune * rune)127*ccdc9c3eSSadaf Ebrahimi runetochar(char *str, const Rune *rune)
128*ccdc9c3eSSadaf Ebrahimi {
129*ccdc9c3eSSadaf Ebrahimi 	/* Runes are signed, so convert to unsigned for range check. */
130*ccdc9c3eSSadaf Ebrahimi 	unsigned long c;
131*ccdc9c3eSSadaf Ebrahimi 
132*ccdc9c3eSSadaf Ebrahimi 	/*
133*ccdc9c3eSSadaf Ebrahimi 	 * one character sequence
134*ccdc9c3eSSadaf Ebrahimi 	 *	00000-0007F => 00-7F
135*ccdc9c3eSSadaf Ebrahimi 	 */
136*ccdc9c3eSSadaf Ebrahimi 	c = *rune;
137*ccdc9c3eSSadaf Ebrahimi 	if(c <= Rune1) {
138*ccdc9c3eSSadaf Ebrahimi 		str[0] = static_cast<char>(c);
139*ccdc9c3eSSadaf Ebrahimi 		return 1;
140*ccdc9c3eSSadaf Ebrahimi 	}
141*ccdc9c3eSSadaf Ebrahimi 
142*ccdc9c3eSSadaf Ebrahimi 	/*
143*ccdc9c3eSSadaf Ebrahimi 	 * two character sequence
144*ccdc9c3eSSadaf Ebrahimi 	 *	0080-07FF => T2 Tx
145*ccdc9c3eSSadaf Ebrahimi 	 */
146*ccdc9c3eSSadaf Ebrahimi 	if(c <= Rune2) {
147*ccdc9c3eSSadaf Ebrahimi 		str[0] = T2 | static_cast<char>(c >> 1*Bitx);
148*ccdc9c3eSSadaf Ebrahimi 		str[1] = Tx | (c & Maskx);
149*ccdc9c3eSSadaf Ebrahimi 		return 2;
150*ccdc9c3eSSadaf Ebrahimi 	}
151*ccdc9c3eSSadaf Ebrahimi 
152*ccdc9c3eSSadaf Ebrahimi 	/*
153*ccdc9c3eSSadaf Ebrahimi 	 * If the Rune is out of range, convert it to the error rune.
154*ccdc9c3eSSadaf Ebrahimi 	 * Do this test here because the error rune encodes to three bytes.
155*ccdc9c3eSSadaf Ebrahimi 	 * Doing it earlier would duplicate work, since an out of range
156*ccdc9c3eSSadaf Ebrahimi 	 * Rune wouldn't have fit in one or two bytes.
157*ccdc9c3eSSadaf Ebrahimi 	 */
158*ccdc9c3eSSadaf Ebrahimi 	if (c > Runemax)
159*ccdc9c3eSSadaf Ebrahimi 		c = Runeerror;
160*ccdc9c3eSSadaf Ebrahimi 
161*ccdc9c3eSSadaf Ebrahimi 	/*
162*ccdc9c3eSSadaf Ebrahimi 	 * three character sequence
163*ccdc9c3eSSadaf Ebrahimi 	 *	0800-FFFF => T3 Tx Tx
164*ccdc9c3eSSadaf Ebrahimi 	 */
165*ccdc9c3eSSadaf Ebrahimi 	if (c <= Rune3) {
166*ccdc9c3eSSadaf Ebrahimi 		str[0] = T3 | static_cast<char>(c >> 2*Bitx);
167*ccdc9c3eSSadaf Ebrahimi 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
168*ccdc9c3eSSadaf Ebrahimi 		str[2] = Tx | (c & Maskx);
169*ccdc9c3eSSadaf Ebrahimi 		return 3;
170*ccdc9c3eSSadaf Ebrahimi 	}
171*ccdc9c3eSSadaf Ebrahimi 
172*ccdc9c3eSSadaf Ebrahimi 	/*
173*ccdc9c3eSSadaf Ebrahimi 	 * four character sequence (21-bit value)
174*ccdc9c3eSSadaf Ebrahimi 	 *     10000-1FFFFF => T4 Tx Tx Tx
175*ccdc9c3eSSadaf Ebrahimi 	 */
176*ccdc9c3eSSadaf Ebrahimi 	str[0] = T4 | static_cast<char>(c >> 3*Bitx);
177*ccdc9c3eSSadaf Ebrahimi 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
178*ccdc9c3eSSadaf Ebrahimi 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
179*ccdc9c3eSSadaf Ebrahimi 	str[3] = Tx | (c & Maskx);
180*ccdc9c3eSSadaf Ebrahimi 	return 4;
181*ccdc9c3eSSadaf Ebrahimi }
182*ccdc9c3eSSadaf Ebrahimi 
183*ccdc9c3eSSadaf Ebrahimi int
runelen(Rune rune)184*ccdc9c3eSSadaf Ebrahimi runelen(Rune rune)
185*ccdc9c3eSSadaf Ebrahimi {
186*ccdc9c3eSSadaf Ebrahimi 	char str[10];
187*ccdc9c3eSSadaf Ebrahimi 
188*ccdc9c3eSSadaf Ebrahimi 	return runetochar(str, &rune);
189*ccdc9c3eSSadaf Ebrahimi }
190*ccdc9c3eSSadaf Ebrahimi 
191*ccdc9c3eSSadaf Ebrahimi int
fullrune(const char * str,int n)192*ccdc9c3eSSadaf Ebrahimi fullrune(const char *str, int n)
193*ccdc9c3eSSadaf Ebrahimi {
194*ccdc9c3eSSadaf Ebrahimi 	if (n > 0) {
195*ccdc9c3eSSadaf Ebrahimi 		int c = *(unsigned char*)str;
196*ccdc9c3eSSadaf Ebrahimi 		if (c < Tx)
197*ccdc9c3eSSadaf Ebrahimi 			return 1;
198*ccdc9c3eSSadaf Ebrahimi 		if (n > 1) {
199*ccdc9c3eSSadaf Ebrahimi 			if (c < T3)
200*ccdc9c3eSSadaf Ebrahimi 				return 1;
201*ccdc9c3eSSadaf Ebrahimi 			if (n > 2) {
202*ccdc9c3eSSadaf Ebrahimi 				if (c < T4 || n > 3)
203*ccdc9c3eSSadaf Ebrahimi 					return 1;
204*ccdc9c3eSSadaf Ebrahimi 			}
205*ccdc9c3eSSadaf Ebrahimi 		}
206*ccdc9c3eSSadaf Ebrahimi 	}
207*ccdc9c3eSSadaf Ebrahimi 	return 0;
208*ccdc9c3eSSadaf Ebrahimi }
209*ccdc9c3eSSadaf Ebrahimi 
210*ccdc9c3eSSadaf Ebrahimi 
211*ccdc9c3eSSadaf Ebrahimi int
utflen(const char * s)212*ccdc9c3eSSadaf Ebrahimi utflen(const char *s)
213*ccdc9c3eSSadaf Ebrahimi {
214*ccdc9c3eSSadaf Ebrahimi 	int c;
215*ccdc9c3eSSadaf Ebrahimi 	long n;
216*ccdc9c3eSSadaf Ebrahimi 	Rune rune;
217*ccdc9c3eSSadaf Ebrahimi 
218*ccdc9c3eSSadaf Ebrahimi 	n = 0;
219*ccdc9c3eSSadaf Ebrahimi 	for(;;) {
220*ccdc9c3eSSadaf Ebrahimi 		c = *(unsigned char*)s;
221*ccdc9c3eSSadaf Ebrahimi 		if(c < Runeself) {
222*ccdc9c3eSSadaf Ebrahimi 			if(c == 0)
223*ccdc9c3eSSadaf Ebrahimi 				return n;
224*ccdc9c3eSSadaf Ebrahimi 			s++;
225*ccdc9c3eSSadaf Ebrahimi 		} else
226*ccdc9c3eSSadaf Ebrahimi 			s += chartorune(&rune, s);
227*ccdc9c3eSSadaf Ebrahimi 		n++;
228*ccdc9c3eSSadaf Ebrahimi 	}
229*ccdc9c3eSSadaf Ebrahimi 	return 0;
230*ccdc9c3eSSadaf Ebrahimi }
231*ccdc9c3eSSadaf Ebrahimi 
232*ccdc9c3eSSadaf Ebrahimi char*
utfrune(const char * s,Rune c)233*ccdc9c3eSSadaf Ebrahimi utfrune(const char *s, Rune c)
234*ccdc9c3eSSadaf Ebrahimi {
235*ccdc9c3eSSadaf Ebrahimi 	long c1;
236*ccdc9c3eSSadaf Ebrahimi 	Rune r;
237*ccdc9c3eSSadaf Ebrahimi 	int n;
238*ccdc9c3eSSadaf Ebrahimi 
239*ccdc9c3eSSadaf Ebrahimi 	if(c < Runesync)		/* not part of utf sequence */
240*ccdc9c3eSSadaf Ebrahimi 		return strchr((char*)s, c);
241*ccdc9c3eSSadaf Ebrahimi 
242*ccdc9c3eSSadaf Ebrahimi 	for(;;) {
243*ccdc9c3eSSadaf Ebrahimi 		c1 = *(unsigned char*)s;
244*ccdc9c3eSSadaf Ebrahimi 		if(c1 < Runeself) {	/* one byte rune */
245*ccdc9c3eSSadaf Ebrahimi 			if(c1 == 0)
246*ccdc9c3eSSadaf Ebrahimi 				return 0;
247*ccdc9c3eSSadaf Ebrahimi 			if(c1 == c)
248*ccdc9c3eSSadaf Ebrahimi 				return (char*)s;
249*ccdc9c3eSSadaf Ebrahimi 			s++;
250*ccdc9c3eSSadaf Ebrahimi 			continue;
251*ccdc9c3eSSadaf Ebrahimi 		}
252*ccdc9c3eSSadaf Ebrahimi 		n = chartorune(&r, s);
253*ccdc9c3eSSadaf Ebrahimi 		if(r == c)
254*ccdc9c3eSSadaf Ebrahimi 			return (char*)s;
255*ccdc9c3eSSadaf Ebrahimi 		s += n;
256*ccdc9c3eSSadaf Ebrahimi 	}
257*ccdc9c3eSSadaf Ebrahimi 	return 0;
258*ccdc9c3eSSadaf Ebrahimi }
259*ccdc9c3eSSadaf Ebrahimi 
260*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
261