1*ccdc9c3eSSadaf Ebrahimi /*
2*ccdc9c3eSSadaf Ebrahimi * The authors of this software are Rob Pike and Ken Thompson.
3*ccdc9c3eSSadaf Ebrahimi * Copyright (c) 2002 by Lucent Technologies.
4*ccdc9c3eSSadaf Ebrahimi * Permission to use, copy, modify, and distribute this software for any
5*ccdc9c3eSSadaf Ebrahimi * purpose without fee is hereby granted, provided that this entire notice
6*ccdc9c3eSSadaf Ebrahimi * is included in all copies of any software which is or includes a copy
7*ccdc9c3eSSadaf Ebrahimi * or modification of this software and in all copies of the supporting
8*ccdc9c3eSSadaf Ebrahimi * documentation for such software.
9*ccdc9c3eSSadaf Ebrahimi * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10*ccdc9c3eSSadaf Ebrahimi * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11*ccdc9c3eSSadaf Ebrahimi * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12*ccdc9c3eSSadaf Ebrahimi * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13*ccdc9c3eSSadaf Ebrahimi */
14*ccdc9c3eSSadaf Ebrahimi
15*ccdc9c3eSSadaf Ebrahimi #include <stdarg.h>
16*ccdc9c3eSSadaf Ebrahimi #include <string.h>
17*ccdc9c3eSSadaf Ebrahimi
18*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
19*ccdc9c3eSSadaf Ebrahimi
20*ccdc9c3eSSadaf Ebrahimi namespace re2 {
21*ccdc9c3eSSadaf Ebrahimi
22*ccdc9c3eSSadaf Ebrahimi enum
23*ccdc9c3eSSadaf Ebrahimi {
24*ccdc9c3eSSadaf Ebrahimi Bit1 = 7,
25*ccdc9c3eSSadaf Ebrahimi Bitx = 6,
26*ccdc9c3eSSadaf Ebrahimi Bit2 = 5,
27*ccdc9c3eSSadaf Ebrahimi Bit3 = 4,
28*ccdc9c3eSSadaf Ebrahimi Bit4 = 3,
29*ccdc9c3eSSadaf Ebrahimi Bit5 = 2,
30*ccdc9c3eSSadaf Ebrahimi
31*ccdc9c3eSSadaf Ebrahimi T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
32*ccdc9c3eSSadaf Ebrahimi Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
33*ccdc9c3eSSadaf Ebrahimi T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
34*ccdc9c3eSSadaf Ebrahimi T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
35*ccdc9c3eSSadaf Ebrahimi T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
36*ccdc9c3eSSadaf Ebrahimi T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
37*ccdc9c3eSSadaf Ebrahimi
38*ccdc9c3eSSadaf Ebrahimi Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
39*ccdc9c3eSSadaf Ebrahimi Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
40*ccdc9c3eSSadaf Ebrahimi Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
41*ccdc9c3eSSadaf Ebrahimi Rune4 = (1<<(Bit4+3*Bitx))-1,
42*ccdc9c3eSSadaf Ebrahimi /* 0001 1111 1111 1111 1111 1111 */
43*ccdc9c3eSSadaf Ebrahimi
44*ccdc9c3eSSadaf Ebrahimi Maskx = (1<<Bitx)-1, /* 0011 1111 */
45*ccdc9c3eSSadaf Ebrahimi Testx = Maskx ^ 0xFF, /* 1100 0000 */
46*ccdc9c3eSSadaf Ebrahimi
47*ccdc9c3eSSadaf Ebrahimi Bad = Runeerror,
48*ccdc9c3eSSadaf Ebrahimi };
49*ccdc9c3eSSadaf Ebrahimi
50*ccdc9c3eSSadaf Ebrahimi int
chartorune(Rune * rune,const char * str)51*ccdc9c3eSSadaf Ebrahimi chartorune(Rune *rune, const char *str)
52*ccdc9c3eSSadaf Ebrahimi {
53*ccdc9c3eSSadaf Ebrahimi int c, c1, c2, c3;
54*ccdc9c3eSSadaf Ebrahimi long l;
55*ccdc9c3eSSadaf Ebrahimi
56*ccdc9c3eSSadaf Ebrahimi /*
57*ccdc9c3eSSadaf Ebrahimi * one character sequence
58*ccdc9c3eSSadaf Ebrahimi * 00000-0007F => T1
59*ccdc9c3eSSadaf Ebrahimi */
60*ccdc9c3eSSadaf Ebrahimi c = *(unsigned char*)str;
61*ccdc9c3eSSadaf Ebrahimi if(c < Tx) {
62*ccdc9c3eSSadaf Ebrahimi *rune = c;
63*ccdc9c3eSSadaf Ebrahimi return 1;
64*ccdc9c3eSSadaf Ebrahimi }
65*ccdc9c3eSSadaf Ebrahimi
66*ccdc9c3eSSadaf Ebrahimi /*
67*ccdc9c3eSSadaf Ebrahimi * two character sequence
68*ccdc9c3eSSadaf Ebrahimi * 0080-07FF => T2 Tx
69*ccdc9c3eSSadaf Ebrahimi */
70*ccdc9c3eSSadaf Ebrahimi c1 = *(unsigned char*)(str+1) ^ Tx;
71*ccdc9c3eSSadaf Ebrahimi if(c1 & Testx)
72*ccdc9c3eSSadaf Ebrahimi goto bad;
73*ccdc9c3eSSadaf Ebrahimi if(c < T3) {
74*ccdc9c3eSSadaf Ebrahimi if(c < T2)
75*ccdc9c3eSSadaf Ebrahimi goto bad;
76*ccdc9c3eSSadaf Ebrahimi l = ((c << Bitx) | c1) & Rune2;
77*ccdc9c3eSSadaf Ebrahimi if(l <= Rune1)
78*ccdc9c3eSSadaf Ebrahimi goto bad;
79*ccdc9c3eSSadaf Ebrahimi *rune = l;
80*ccdc9c3eSSadaf Ebrahimi return 2;
81*ccdc9c3eSSadaf Ebrahimi }
82*ccdc9c3eSSadaf Ebrahimi
83*ccdc9c3eSSadaf Ebrahimi /*
84*ccdc9c3eSSadaf Ebrahimi * three character sequence
85*ccdc9c3eSSadaf Ebrahimi * 0800-FFFF => T3 Tx Tx
86*ccdc9c3eSSadaf Ebrahimi */
87*ccdc9c3eSSadaf Ebrahimi c2 = *(unsigned char*)(str+2) ^ Tx;
88*ccdc9c3eSSadaf Ebrahimi if(c2 & Testx)
89*ccdc9c3eSSadaf Ebrahimi goto bad;
90*ccdc9c3eSSadaf Ebrahimi if(c < T4) {
91*ccdc9c3eSSadaf Ebrahimi l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
92*ccdc9c3eSSadaf Ebrahimi if(l <= Rune2)
93*ccdc9c3eSSadaf Ebrahimi goto bad;
94*ccdc9c3eSSadaf Ebrahimi *rune = l;
95*ccdc9c3eSSadaf Ebrahimi return 3;
96*ccdc9c3eSSadaf Ebrahimi }
97*ccdc9c3eSSadaf Ebrahimi
98*ccdc9c3eSSadaf Ebrahimi /*
99*ccdc9c3eSSadaf Ebrahimi * four character sequence (21-bit value)
100*ccdc9c3eSSadaf Ebrahimi * 10000-1FFFFF => T4 Tx Tx Tx
101*ccdc9c3eSSadaf Ebrahimi */
102*ccdc9c3eSSadaf Ebrahimi c3 = *(unsigned char*)(str+3) ^ Tx;
103*ccdc9c3eSSadaf Ebrahimi if (c3 & Testx)
104*ccdc9c3eSSadaf Ebrahimi goto bad;
105*ccdc9c3eSSadaf Ebrahimi if (c < T5) {
106*ccdc9c3eSSadaf Ebrahimi l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
107*ccdc9c3eSSadaf Ebrahimi if (l <= Rune3)
108*ccdc9c3eSSadaf Ebrahimi goto bad;
109*ccdc9c3eSSadaf Ebrahimi *rune = l;
110*ccdc9c3eSSadaf Ebrahimi return 4;
111*ccdc9c3eSSadaf Ebrahimi }
112*ccdc9c3eSSadaf Ebrahimi
113*ccdc9c3eSSadaf Ebrahimi /*
114*ccdc9c3eSSadaf Ebrahimi * Support for 5-byte or longer UTF-8 would go here, but
115*ccdc9c3eSSadaf Ebrahimi * since we don't have that, we'll just fall through to bad.
116*ccdc9c3eSSadaf Ebrahimi */
117*ccdc9c3eSSadaf Ebrahimi
118*ccdc9c3eSSadaf Ebrahimi /*
119*ccdc9c3eSSadaf Ebrahimi * bad decoding
120*ccdc9c3eSSadaf Ebrahimi */
121*ccdc9c3eSSadaf Ebrahimi bad:
122*ccdc9c3eSSadaf Ebrahimi *rune = Bad;
123*ccdc9c3eSSadaf Ebrahimi return 1;
124*ccdc9c3eSSadaf Ebrahimi }
125*ccdc9c3eSSadaf Ebrahimi
126*ccdc9c3eSSadaf Ebrahimi int
runetochar(char * str,const Rune * rune)127*ccdc9c3eSSadaf Ebrahimi runetochar(char *str, const Rune *rune)
128*ccdc9c3eSSadaf Ebrahimi {
129*ccdc9c3eSSadaf Ebrahimi /* Runes are signed, so convert to unsigned for range check. */
130*ccdc9c3eSSadaf Ebrahimi unsigned long c;
131*ccdc9c3eSSadaf Ebrahimi
132*ccdc9c3eSSadaf Ebrahimi /*
133*ccdc9c3eSSadaf Ebrahimi * one character sequence
134*ccdc9c3eSSadaf Ebrahimi * 00000-0007F => 00-7F
135*ccdc9c3eSSadaf Ebrahimi */
136*ccdc9c3eSSadaf Ebrahimi c = *rune;
137*ccdc9c3eSSadaf Ebrahimi if(c <= Rune1) {
138*ccdc9c3eSSadaf Ebrahimi str[0] = static_cast<char>(c);
139*ccdc9c3eSSadaf Ebrahimi return 1;
140*ccdc9c3eSSadaf Ebrahimi }
141*ccdc9c3eSSadaf Ebrahimi
142*ccdc9c3eSSadaf Ebrahimi /*
143*ccdc9c3eSSadaf Ebrahimi * two character sequence
144*ccdc9c3eSSadaf Ebrahimi * 0080-07FF => T2 Tx
145*ccdc9c3eSSadaf Ebrahimi */
146*ccdc9c3eSSadaf Ebrahimi if(c <= Rune2) {
147*ccdc9c3eSSadaf Ebrahimi str[0] = T2 | static_cast<char>(c >> 1*Bitx);
148*ccdc9c3eSSadaf Ebrahimi str[1] = Tx | (c & Maskx);
149*ccdc9c3eSSadaf Ebrahimi return 2;
150*ccdc9c3eSSadaf Ebrahimi }
151*ccdc9c3eSSadaf Ebrahimi
152*ccdc9c3eSSadaf Ebrahimi /*
153*ccdc9c3eSSadaf Ebrahimi * If the Rune is out of range, convert it to the error rune.
154*ccdc9c3eSSadaf Ebrahimi * Do this test here because the error rune encodes to three bytes.
155*ccdc9c3eSSadaf Ebrahimi * Doing it earlier would duplicate work, since an out of range
156*ccdc9c3eSSadaf Ebrahimi * Rune wouldn't have fit in one or two bytes.
157*ccdc9c3eSSadaf Ebrahimi */
158*ccdc9c3eSSadaf Ebrahimi if (c > Runemax)
159*ccdc9c3eSSadaf Ebrahimi c = Runeerror;
160*ccdc9c3eSSadaf Ebrahimi
161*ccdc9c3eSSadaf Ebrahimi /*
162*ccdc9c3eSSadaf Ebrahimi * three character sequence
163*ccdc9c3eSSadaf Ebrahimi * 0800-FFFF => T3 Tx Tx
164*ccdc9c3eSSadaf Ebrahimi */
165*ccdc9c3eSSadaf Ebrahimi if (c <= Rune3) {
166*ccdc9c3eSSadaf Ebrahimi str[0] = T3 | static_cast<char>(c >> 2*Bitx);
167*ccdc9c3eSSadaf Ebrahimi str[1] = Tx | ((c >> 1*Bitx) & Maskx);
168*ccdc9c3eSSadaf Ebrahimi str[2] = Tx | (c & Maskx);
169*ccdc9c3eSSadaf Ebrahimi return 3;
170*ccdc9c3eSSadaf Ebrahimi }
171*ccdc9c3eSSadaf Ebrahimi
172*ccdc9c3eSSadaf Ebrahimi /*
173*ccdc9c3eSSadaf Ebrahimi * four character sequence (21-bit value)
174*ccdc9c3eSSadaf Ebrahimi * 10000-1FFFFF => T4 Tx Tx Tx
175*ccdc9c3eSSadaf Ebrahimi */
176*ccdc9c3eSSadaf Ebrahimi str[0] = T4 | static_cast<char>(c >> 3*Bitx);
177*ccdc9c3eSSadaf Ebrahimi str[1] = Tx | ((c >> 2*Bitx) & Maskx);
178*ccdc9c3eSSadaf Ebrahimi str[2] = Tx | ((c >> 1*Bitx) & Maskx);
179*ccdc9c3eSSadaf Ebrahimi str[3] = Tx | (c & Maskx);
180*ccdc9c3eSSadaf Ebrahimi return 4;
181*ccdc9c3eSSadaf Ebrahimi }
182*ccdc9c3eSSadaf Ebrahimi
183*ccdc9c3eSSadaf Ebrahimi int
runelen(Rune rune)184*ccdc9c3eSSadaf Ebrahimi runelen(Rune rune)
185*ccdc9c3eSSadaf Ebrahimi {
186*ccdc9c3eSSadaf Ebrahimi char str[10];
187*ccdc9c3eSSadaf Ebrahimi
188*ccdc9c3eSSadaf Ebrahimi return runetochar(str, &rune);
189*ccdc9c3eSSadaf Ebrahimi }
190*ccdc9c3eSSadaf Ebrahimi
191*ccdc9c3eSSadaf Ebrahimi int
fullrune(const char * str,int n)192*ccdc9c3eSSadaf Ebrahimi fullrune(const char *str, int n)
193*ccdc9c3eSSadaf Ebrahimi {
194*ccdc9c3eSSadaf Ebrahimi if (n > 0) {
195*ccdc9c3eSSadaf Ebrahimi int c = *(unsigned char*)str;
196*ccdc9c3eSSadaf Ebrahimi if (c < Tx)
197*ccdc9c3eSSadaf Ebrahimi return 1;
198*ccdc9c3eSSadaf Ebrahimi if (n > 1) {
199*ccdc9c3eSSadaf Ebrahimi if (c < T3)
200*ccdc9c3eSSadaf Ebrahimi return 1;
201*ccdc9c3eSSadaf Ebrahimi if (n > 2) {
202*ccdc9c3eSSadaf Ebrahimi if (c < T4 || n > 3)
203*ccdc9c3eSSadaf Ebrahimi return 1;
204*ccdc9c3eSSadaf Ebrahimi }
205*ccdc9c3eSSadaf Ebrahimi }
206*ccdc9c3eSSadaf Ebrahimi }
207*ccdc9c3eSSadaf Ebrahimi return 0;
208*ccdc9c3eSSadaf Ebrahimi }
209*ccdc9c3eSSadaf Ebrahimi
210*ccdc9c3eSSadaf Ebrahimi
211*ccdc9c3eSSadaf Ebrahimi int
utflen(const char * s)212*ccdc9c3eSSadaf Ebrahimi utflen(const char *s)
213*ccdc9c3eSSadaf Ebrahimi {
214*ccdc9c3eSSadaf Ebrahimi int c;
215*ccdc9c3eSSadaf Ebrahimi long n;
216*ccdc9c3eSSadaf Ebrahimi Rune rune;
217*ccdc9c3eSSadaf Ebrahimi
218*ccdc9c3eSSadaf Ebrahimi n = 0;
219*ccdc9c3eSSadaf Ebrahimi for(;;) {
220*ccdc9c3eSSadaf Ebrahimi c = *(unsigned char*)s;
221*ccdc9c3eSSadaf Ebrahimi if(c < Runeself) {
222*ccdc9c3eSSadaf Ebrahimi if(c == 0)
223*ccdc9c3eSSadaf Ebrahimi return n;
224*ccdc9c3eSSadaf Ebrahimi s++;
225*ccdc9c3eSSadaf Ebrahimi } else
226*ccdc9c3eSSadaf Ebrahimi s += chartorune(&rune, s);
227*ccdc9c3eSSadaf Ebrahimi n++;
228*ccdc9c3eSSadaf Ebrahimi }
229*ccdc9c3eSSadaf Ebrahimi return 0;
230*ccdc9c3eSSadaf Ebrahimi }
231*ccdc9c3eSSadaf Ebrahimi
232*ccdc9c3eSSadaf Ebrahimi char*
utfrune(const char * s,Rune c)233*ccdc9c3eSSadaf Ebrahimi utfrune(const char *s, Rune c)
234*ccdc9c3eSSadaf Ebrahimi {
235*ccdc9c3eSSadaf Ebrahimi long c1;
236*ccdc9c3eSSadaf Ebrahimi Rune r;
237*ccdc9c3eSSadaf Ebrahimi int n;
238*ccdc9c3eSSadaf Ebrahimi
239*ccdc9c3eSSadaf Ebrahimi if(c < Runesync) /* not part of utf sequence */
240*ccdc9c3eSSadaf Ebrahimi return strchr((char*)s, c);
241*ccdc9c3eSSadaf Ebrahimi
242*ccdc9c3eSSadaf Ebrahimi for(;;) {
243*ccdc9c3eSSadaf Ebrahimi c1 = *(unsigned char*)s;
244*ccdc9c3eSSadaf Ebrahimi if(c1 < Runeself) { /* one byte rune */
245*ccdc9c3eSSadaf Ebrahimi if(c1 == 0)
246*ccdc9c3eSSadaf Ebrahimi return 0;
247*ccdc9c3eSSadaf Ebrahimi if(c1 == c)
248*ccdc9c3eSSadaf Ebrahimi return (char*)s;
249*ccdc9c3eSSadaf Ebrahimi s++;
250*ccdc9c3eSSadaf Ebrahimi continue;
251*ccdc9c3eSSadaf Ebrahimi }
252*ccdc9c3eSSadaf Ebrahimi n = chartorune(&r, s);
253*ccdc9c3eSSadaf Ebrahimi if(r == c)
254*ccdc9c3eSSadaf Ebrahimi return (char*)s;
255*ccdc9c3eSSadaf Ebrahimi s += n;
256*ccdc9c3eSSadaf Ebrahimi }
257*ccdc9c3eSSadaf Ebrahimi return 0;
258*ccdc9c3eSSadaf Ebrahimi }
259*ccdc9c3eSSadaf Ebrahimi
260*ccdc9c3eSSadaf Ebrahimi } // namespace re2
261