xref: /aosp_15_r20/external/one-true-awk/b.c (revision 9a7741de182b2776d7b30d6355f2585c0780a51b)
1*9a7741deSElliott Hughes /****************************************************************
2*9a7741deSElliott Hughes Copyright (C) Lucent Technologies 1997
3*9a7741deSElliott Hughes All Rights Reserved
4*9a7741deSElliott Hughes 
5*9a7741deSElliott Hughes Permission to use, copy, modify, and distribute this software and
6*9a7741deSElliott Hughes its documentation for any purpose and without fee is hereby
7*9a7741deSElliott Hughes granted, provided that the above copyright notice appear in all
8*9a7741deSElliott Hughes copies and that both that the copyright notice and this
9*9a7741deSElliott Hughes permission notice and warranty disclaimer appear in supporting
10*9a7741deSElliott Hughes documentation, and that the name Lucent Technologies or any of
11*9a7741deSElliott Hughes its entities not be used in advertising or publicity pertaining
12*9a7741deSElliott Hughes to distribution of the software without specific, written prior
13*9a7741deSElliott Hughes permission.
14*9a7741deSElliott Hughes 
15*9a7741deSElliott Hughes LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16*9a7741deSElliott Hughes INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17*9a7741deSElliott Hughes IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18*9a7741deSElliott Hughes SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19*9a7741deSElliott Hughes WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20*9a7741deSElliott Hughes IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21*9a7741deSElliott Hughes ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22*9a7741deSElliott Hughes THIS SOFTWARE.
23*9a7741deSElliott Hughes ****************************************************************/
24*9a7741deSElliott Hughes 
25*9a7741deSElliott Hughes /* lasciate ogne speranza, voi ch'intrate. */
26*9a7741deSElliott Hughes 
27*9a7741deSElliott Hughes #define	DEBUG
28*9a7741deSElliott Hughes 
29*9a7741deSElliott Hughes #include <ctype.h>
30*9a7741deSElliott Hughes #include <limits.h>
31*9a7741deSElliott Hughes #include <stdio.h>
32*9a7741deSElliott Hughes #include <string.h>
33*9a7741deSElliott Hughes #include <stdlib.h>
34*9a7741deSElliott Hughes #include "awk.h"
35*9a7741deSElliott Hughes #include "awkgram.tab.h"
36*9a7741deSElliott Hughes 
37*9a7741deSElliott Hughes #define MAXLIN 22
38*9a7741deSElliott Hughes 
39*9a7741deSElliott Hughes #define type(v)		(v)->nobj	/* badly overloaded here */
40*9a7741deSElliott Hughes #define info(v)		(v)->ntype	/* badly overloaded here */
41*9a7741deSElliott Hughes #define left(v)		(v)->narg[0]
42*9a7741deSElliott Hughes #define right(v)	(v)->narg[1]
43*9a7741deSElliott Hughes #define parent(v)	(v)->nnext
44*9a7741deSElliott Hughes 
45*9a7741deSElliott Hughes #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46*9a7741deSElliott Hughes #define ELEAF	case EMPTYRE:		/* empty string in regexp */
47*9a7741deSElliott Hughes #define UNARY	case STAR: case PLUS: case QUEST:
48*9a7741deSElliott Hughes 
49*9a7741deSElliott Hughes /* encoding in tree Nodes:
50*9a7741deSElliott Hughes 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51*9a7741deSElliott Hughes 		left is index, right contains value or pointer to value
52*9a7741deSElliott Hughes 	unary (STAR, PLUS, QUEST): left is child, right is null
53*9a7741deSElliott Hughes 	binary (CAT, OR): left and right are children
54*9a7741deSElliott Hughes 	parent contains pointer to parent
55*9a7741deSElliott Hughes */
56*9a7741deSElliott Hughes 
57*9a7741deSElliott Hughes 
58*9a7741deSElliott Hughes int	*setvec;
59*9a7741deSElliott Hughes int	*tmpset;
60*9a7741deSElliott Hughes int	maxsetvec = 0;
61*9a7741deSElliott Hughes 
62*9a7741deSElliott Hughes int	rtok;		/* next token in current re */
63*9a7741deSElliott Hughes int	rlxval;
64*9a7741deSElliott Hughes static const uschar	*rlxstr;
65*9a7741deSElliott Hughes static const uschar	*prestr;	/* current position in current re */
66*9a7741deSElliott Hughes static const uschar	*lastre;	/* origin of last re */
67*9a7741deSElliott Hughes static const uschar	*lastatom;	/* origin of last Atom */
68*9a7741deSElliott Hughes static const uschar	*starttok;
69*9a7741deSElliott Hughes static const uschar 	*basestr;	/* starts with original, replaced during
70*9a7741deSElliott Hughes 				   repetition processing */
71*9a7741deSElliott Hughes static const uschar 	*firstbasestr;
72*9a7741deSElliott Hughes 
73*9a7741deSElliott Hughes static	int setcnt;
74*9a7741deSElliott Hughes static	int poscnt;
75*9a7741deSElliott Hughes 
76*9a7741deSElliott Hughes const char	*patbeg;
77*9a7741deSElliott Hughes int	patlen;
78*9a7741deSElliott Hughes 
79*9a7741deSElliott Hughes #define	NFA	128	/* cache this many dynamic fa's */
80*9a7741deSElliott Hughes fa	*fatab[NFA];
81*9a7741deSElliott Hughes int	nfatab	= 0;	/* entries in fatab */
82*9a7741deSElliott Hughes 
83*9a7741deSElliott Hughes extern int u8_nextlen(const char *s);
84*9a7741deSElliott Hughes 
85*9a7741deSElliott Hughes 
86*9a7741deSElliott Hughes /* utf-8 mechanism:
87*9a7741deSElliott Hughes 
88*9a7741deSElliott Hughes    For most of Awk, utf-8 strings just "work", since they look like
89*9a7741deSElliott Hughes    null-terminated sequences of 8-bit bytes.
90*9a7741deSElliott Hughes 
91*9a7741deSElliott Hughes    Functions like length(), index(), and substr() have to operate
92*9a7741deSElliott Hughes    in units of utf-8 characters.  The u8_* functions in run.c
93*9a7741deSElliott Hughes    handle this.
94*9a7741deSElliott Hughes 
95*9a7741deSElliott Hughes    Regular expressions are more complicated, since the basic
96*9a7741deSElliott Hughes    mechanism of the goto table used 8-bit byte indices into the
97*9a7741deSElliott Hughes    gototab entries to compute the next state.  Unicode is a lot
98*9a7741deSElliott Hughes    bigger, so the gototab entries are now structs with a character
99*9a7741deSElliott Hughes    and a next state. These are sorted by code point and binary
100*9a7741deSElliott Hughes    searched.
101*9a7741deSElliott Hughes 
102*9a7741deSElliott Hughes    Throughout the RE mechanism in b.c, utf-8 characters are
103*9a7741deSElliott Hughes    converted to their utf-32 value.  This mostly shows up in
104*9a7741deSElliott Hughes    cclenter, which expands character class ranges like a-z and now
105*9a7741deSElliott Hughes    alpha-omega.  The size of a gototab array is still about 256.
106*9a7741deSElliott Hughes    This should be dynamic, but for now things work ok for a single
107*9a7741deSElliott Hughes    code page of Unicode, which is the most likely case.
108*9a7741deSElliott Hughes 
109*9a7741deSElliott Hughes    The code changes are localized in run.c and b.c.  I have added a
110*9a7741deSElliott Hughes    handful of functions to somewhat better hide the implementation,
111*9a7741deSElliott Hughes    but a lot more could be done.
112*9a7741deSElliott Hughes 
113*9a7741deSElliott Hughes  */
114*9a7741deSElliott Hughes 
115*9a7741deSElliott Hughes static int entry_cmp(const void *l, const void *r);
116*9a7741deSElliott Hughes static int get_gototab(fa*, int, int);
117*9a7741deSElliott Hughes static int set_gototab(fa*, int, int, int);
118*9a7741deSElliott Hughes static void clear_gototab(fa*, int);
119*9a7741deSElliott Hughes extern int u8_rune(int *, const char *);
120*9a7741deSElliott Hughes 
121*9a7741deSElliott Hughes static int *
intalloc(size_t n,const char * f)122*9a7741deSElliott Hughes intalloc(size_t n, const char *f)
123*9a7741deSElliott Hughes {
124*9a7741deSElliott Hughes 	int *p = (int *) calloc(n, sizeof(int));
125*9a7741deSElliott Hughes 	if (p == NULL)
126*9a7741deSElliott Hughes 		overflo(f);
127*9a7741deSElliott Hughes 	return p;
128*9a7741deSElliott Hughes }
129*9a7741deSElliott Hughes 
130*9a7741deSElliott Hughes static void
resizesetvec(const char * f)131*9a7741deSElliott Hughes resizesetvec(const char *f)
132*9a7741deSElliott Hughes {
133*9a7741deSElliott Hughes 	if (maxsetvec == 0)
134*9a7741deSElliott Hughes 		maxsetvec = MAXLIN;
135*9a7741deSElliott Hughes 	else
136*9a7741deSElliott Hughes 		maxsetvec *= 4;
137*9a7741deSElliott Hughes 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138*9a7741deSElliott Hughes 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139*9a7741deSElliott Hughes 	if (setvec == NULL || tmpset == NULL)
140*9a7741deSElliott Hughes 		overflo(f);
141*9a7741deSElliott Hughes }
142*9a7741deSElliott Hughes 
143*9a7741deSElliott Hughes static void
resize_state(fa * f,int state)144*9a7741deSElliott Hughes resize_state(fa *f, int state)
145*9a7741deSElliott Hughes {
146*9a7741deSElliott Hughes 	gtt *p;
147*9a7741deSElliott Hughes 	uschar *p2;
148*9a7741deSElliott Hughes 	int **p3;
149*9a7741deSElliott Hughes 	int i, new_count;
150*9a7741deSElliott Hughes 
151*9a7741deSElliott Hughes 	if (++state < f->state_count)
152*9a7741deSElliott Hughes 		return;
153*9a7741deSElliott Hughes 
154*9a7741deSElliott Hughes 	new_count = state + 10; /* needs to be tuned */
155*9a7741deSElliott Hughes 
156*9a7741deSElliott Hughes 	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157*9a7741deSElliott Hughes 	if (p == NULL)
158*9a7741deSElliott Hughes 		goto out;
159*9a7741deSElliott Hughes 	f->gototab = p;
160*9a7741deSElliott Hughes 
161*9a7741deSElliott Hughes 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162*9a7741deSElliott Hughes 	if (p2 == NULL)
163*9a7741deSElliott Hughes 		goto out;
164*9a7741deSElliott Hughes 	f->out = p2;
165*9a7741deSElliott Hughes 
166*9a7741deSElliott Hughes 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167*9a7741deSElliott Hughes 	if (p3 == NULL)
168*9a7741deSElliott Hughes 		goto out;
169*9a7741deSElliott Hughes 	f->posns = p3;
170*9a7741deSElliott Hughes 
171*9a7741deSElliott Hughes 	for (i = f->state_count; i < new_count; ++i) {
172*9a7741deSElliott Hughes 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173*9a7741deSElliott Hughes 		if (f->gototab[i].entries == NULL)
174*9a7741deSElliott Hughes 			goto out;
175*9a7741deSElliott Hughes 		f->gototab[i].allocated = NCHARS;
176*9a7741deSElliott Hughes 		f->gototab[i].inuse = 0;
177*9a7741deSElliott Hughes 		f->out[i] = 0;
178*9a7741deSElliott Hughes 		f->posns[i] = NULL;
179*9a7741deSElliott Hughes 	}
180*9a7741deSElliott Hughes 	f->state_count = new_count;
181*9a7741deSElliott Hughes 	return;
182*9a7741deSElliott Hughes out:
183*9a7741deSElliott Hughes 	overflo(__func__);
184*9a7741deSElliott Hughes }
185*9a7741deSElliott Hughes 
makedfa(const char * s,bool anchor)186*9a7741deSElliott Hughes fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
187*9a7741deSElliott Hughes {
188*9a7741deSElliott Hughes 	int i, use, nuse;
189*9a7741deSElliott Hughes 	fa *pfa;
190*9a7741deSElliott Hughes 	static int now = 1;
191*9a7741deSElliott Hughes 
192*9a7741deSElliott Hughes 	if (setvec == NULL) {	/* first time through any RE */
193*9a7741deSElliott Hughes 		resizesetvec(__func__);
194*9a7741deSElliott Hughes 	}
195*9a7741deSElliott Hughes 
196*9a7741deSElliott Hughes 	if (compile_time != RUNNING)	/* a constant for sure */
197*9a7741deSElliott Hughes 		return mkdfa(s, anchor);
198*9a7741deSElliott Hughes 	for (i = 0; i < nfatab; i++)	/* is it there already? */
199*9a7741deSElliott Hughes 		if (fatab[i]->anchor == anchor
200*9a7741deSElliott Hughes 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
201*9a7741deSElliott Hughes 			fatab[i]->use = now++;
202*9a7741deSElliott Hughes 			return fatab[i];
203*9a7741deSElliott Hughes 		}
204*9a7741deSElliott Hughes 	pfa = mkdfa(s, anchor);
205*9a7741deSElliott Hughes 	if (nfatab < NFA) {	/* room for another */
206*9a7741deSElliott Hughes 		fatab[nfatab] = pfa;
207*9a7741deSElliott Hughes 		fatab[nfatab]->use = now++;
208*9a7741deSElliott Hughes 		nfatab++;
209*9a7741deSElliott Hughes 		return pfa;
210*9a7741deSElliott Hughes 	}
211*9a7741deSElliott Hughes 	use = fatab[0]->use;	/* replace least-recently used */
212*9a7741deSElliott Hughes 	nuse = 0;
213*9a7741deSElliott Hughes 	for (i = 1; i < nfatab; i++)
214*9a7741deSElliott Hughes 		if (fatab[i]->use < use) {
215*9a7741deSElliott Hughes 			use = fatab[i]->use;
216*9a7741deSElliott Hughes 			nuse = i;
217*9a7741deSElliott Hughes 		}
218*9a7741deSElliott Hughes 	freefa(fatab[nuse]);
219*9a7741deSElliott Hughes 	fatab[nuse] = pfa;
220*9a7741deSElliott Hughes 	pfa->use = now++;
221*9a7741deSElliott Hughes 	return pfa;
222*9a7741deSElliott Hughes }
223*9a7741deSElliott Hughes 
mkdfa(const char * s,bool anchor)224*9a7741deSElliott Hughes fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
225*9a7741deSElliott Hughes 				/* anchor = true for anchored matches, else false */
226*9a7741deSElliott Hughes {
227*9a7741deSElliott Hughes 	Node *p, *p1;
228*9a7741deSElliott Hughes 	fa *f;
229*9a7741deSElliott Hughes 
230*9a7741deSElliott Hughes 	firstbasestr = (const uschar *) s;
231*9a7741deSElliott Hughes 	basestr = firstbasestr;
232*9a7741deSElliott Hughes 	p = reparse(s);
233*9a7741deSElliott Hughes 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234*9a7741deSElliott Hughes 		/* put ALL STAR in front of reg.  exp. */
235*9a7741deSElliott Hughes 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236*9a7741deSElliott Hughes 		/* put FINAL after reg.  exp. */
237*9a7741deSElliott Hughes 
238*9a7741deSElliott Hughes 	poscnt = 0;
239*9a7741deSElliott Hughes 	penter(p1);	/* enter parent pointers and leaf indices */
240*9a7741deSElliott Hughes 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241*9a7741deSElliott Hughes 		overflo(__func__);
242*9a7741deSElliott Hughes 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
243*9a7741deSElliott Hughes 	cfoll(f, p1);	/* set up follow sets */
244*9a7741deSElliott Hughes 	freetr(p1);
245*9a7741deSElliott Hughes 	resize_state(f, 1);
246*9a7741deSElliott Hughes 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247*9a7741deSElliott Hughes 	f->posns[1] = intalloc(1, __func__);
248*9a7741deSElliott Hughes 	*f->posns[1] = 0;
249*9a7741deSElliott Hughes 	f->initstat = makeinit(f, anchor);
250*9a7741deSElliott Hughes 	f->anchor = anchor;
251*9a7741deSElliott Hughes 	f->restr = (uschar *) tostring(s);
252*9a7741deSElliott Hughes 	if (firstbasestr != basestr) {
253*9a7741deSElliott Hughes 		if (basestr)
254*9a7741deSElliott Hughes 			xfree(basestr);
255*9a7741deSElliott Hughes 	}
256*9a7741deSElliott Hughes 	return f;
257*9a7741deSElliott Hughes }
258*9a7741deSElliott Hughes 
makeinit(fa * f,bool anchor)259*9a7741deSElliott Hughes int makeinit(fa *f, bool anchor)
260*9a7741deSElliott Hughes {
261*9a7741deSElliott Hughes 	int i, k;
262*9a7741deSElliott Hughes 
263*9a7741deSElliott Hughes 	f->curstat = 2;
264*9a7741deSElliott Hughes 	f->out[2] = 0;
265*9a7741deSElliott Hughes 	k = *(f->re[0].lfollow);
266*9a7741deSElliott Hughes 	xfree(f->posns[2]);
267*9a7741deSElliott Hughes 	f->posns[2] = intalloc(k + 1,  __func__);
268*9a7741deSElliott Hughes 	for (i = 0; i <= k; i++) {
269*9a7741deSElliott Hughes 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
270*9a7741deSElliott Hughes 	}
271*9a7741deSElliott Hughes 	if ((f->posns[2])[1] == f->accept)
272*9a7741deSElliott Hughes 		f->out[2] = 1;
273*9a7741deSElliott Hughes 	clear_gototab(f, 2);
274*9a7741deSElliott Hughes 	f->curstat = cgoto(f, 2, HAT);
275*9a7741deSElliott Hughes 	if (anchor) {
276*9a7741deSElliott Hughes 		*f->posns[2] = k-1;	/* leave out position 0 */
277*9a7741deSElliott Hughes 		for (i = 0; i < k; i++) {
278*9a7741deSElliott Hughes 			(f->posns[0])[i] = (f->posns[2])[i];
279*9a7741deSElliott Hughes 		}
280*9a7741deSElliott Hughes 
281*9a7741deSElliott Hughes 		f->out[0] = f->out[2];
282*9a7741deSElliott Hughes 		if (f->curstat != 2)
283*9a7741deSElliott Hughes 			--(*f->posns[f->curstat]);
284*9a7741deSElliott Hughes 	}
285*9a7741deSElliott Hughes 	return f->curstat;
286*9a7741deSElliott Hughes }
287*9a7741deSElliott Hughes 
penter(Node * p)288*9a7741deSElliott Hughes void penter(Node *p)	/* set up parent pointers and leaf indices */
289*9a7741deSElliott Hughes {
290*9a7741deSElliott Hughes 	switch (type(p)) {
291*9a7741deSElliott Hughes 	ELEAF
292*9a7741deSElliott Hughes 	LEAF
293*9a7741deSElliott Hughes 		info(p) = poscnt;
294*9a7741deSElliott Hughes 		poscnt++;
295*9a7741deSElliott Hughes 		break;
296*9a7741deSElliott Hughes 	UNARY
297*9a7741deSElliott Hughes 		penter(left(p));
298*9a7741deSElliott Hughes 		parent(left(p)) = p;
299*9a7741deSElliott Hughes 		break;
300*9a7741deSElliott Hughes 	case CAT:
301*9a7741deSElliott Hughes 	case OR:
302*9a7741deSElliott Hughes 		penter(left(p));
303*9a7741deSElliott Hughes 		penter(right(p));
304*9a7741deSElliott Hughes 		parent(left(p)) = p;
305*9a7741deSElliott Hughes 		parent(right(p)) = p;
306*9a7741deSElliott Hughes 		break;
307*9a7741deSElliott Hughes 	case ZERO:
308*9a7741deSElliott Hughes 		break;
309*9a7741deSElliott Hughes 	default:	/* can't happen */
310*9a7741deSElliott Hughes 		FATAL("can't happen: unknown type %d in penter", type(p));
311*9a7741deSElliott Hughes 		break;
312*9a7741deSElliott Hughes 	}
313*9a7741deSElliott Hughes }
314*9a7741deSElliott Hughes 
freetr(Node * p)315*9a7741deSElliott Hughes void freetr(Node *p)	/* free parse tree */
316*9a7741deSElliott Hughes {
317*9a7741deSElliott Hughes 	switch (type(p)) {
318*9a7741deSElliott Hughes 	ELEAF
319*9a7741deSElliott Hughes 	LEAF
320*9a7741deSElliott Hughes 		xfree(p);
321*9a7741deSElliott Hughes 		break;
322*9a7741deSElliott Hughes 	UNARY
323*9a7741deSElliott Hughes 	case ZERO:
324*9a7741deSElliott Hughes 		freetr(left(p));
325*9a7741deSElliott Hughes 		xfree(p);
326*9a7741deSElliott Hughes 		break;
327*9a7741deSElliott Hughes 	case CAT:
328*9a7741deSElliott Hughes 	case OR:
329*9a7741deSElliott Hughes 		freetr(left(p));
330*9a7741deSElliott Hughes 		freetr(right(p));
331*9a7741deSElliott Hughes 		xfree(p);
332*9a7741deSElliott Hughes 		break;
333*9a7741deSElliott Hughes 	default:	/* can't happen */
334*9a7741deSElliott Hughes 		FATAL("can't happen: unknown type %d in freetr", type(p));
335*9a7741deSElliott Hughes 		break;
336*9a7741deSElliott Hughes 	}
337*9a7741deSElliott Hughes }
338*9a7741deSElliott Hughes 
339*9a7741deSElliott Hughes /* in the parsing of regular expressions, metacharacters like . have */
340*9a7741deSElliott Hughes /* to be seen literally;  \056 is not a metacharacter. */
341*9a7741deSElliott Hughes 
hexstr(const uschar ** pp,int max)342*9a7741deSElliott Hughes int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
343*9a7741deSElliott Hughes {			/* only pick up one 8-bit byte (2 chars) */
344*9a7741deSElliott Hughes 	const uschar *p;
345*9a7741deSElliott Hughes 	int n = 0;
346*9a7741deSElliott Hughes 	int i;
347*9a7741deSElliott Hughes 
348*9a7741deSElliott Hughes 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349*9a7741deSElliott Hughes 		if (isdigit((int) *p))
350*9a7741deSElliott Hughes 			n = 16 * n + *p - '0';
351*9a7741deSElliott Hughes 		else if (*p >= 'a' && *p <= 'f')
352*9a7741deSElliott Hughes 			n = 16 * n + *p - 'a' + 10;
353*9a7741deSElliott Hughes 		else if (*p >= 'A' && *p <= 'F')
354*9a7741deSElliott Hughes 			n = 16 * n + *p - 'A' + 10;
355*9a7741deSElliott Hughes 	}
356*9a7741deSElliott Hughes 	*pp = p;
357*9a7741deSElliott Hughes 	return n;
358*9a7741deSElliott Hughes }
359*9a7741deSElliott Hughes 
360*9a7741deSElliott Hughes 
361*9a7741deSElliott Hughes 
362*9a7741deSElliott Hughes #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
363*9a7741deSElliott Hughes 
quoted(const uschar ** pp)364*9a7741deSElliott Hughes int quoted(const uschar **pp)	/* pick up next thing after a \\ */
365*9a7741deSElliott Hughes 			/* and increment *pp */
366*9a7741deSElliott Hughes {
367*9a7741deSElliott Hughes 	const uschar *p = *pp;
368*9a7741deSElliott Hughes 	int c;
369*9a7741deSElliott Hughes 
370*9a7741deSElliott Hughes /* BUG: should advance by utf-8 char even if makes no sense */
371*9a7741deSElliott Hughes 
372*9a7741deSElliott Hughes 	switch ((c = *p++)) {
373*9a7741deSElliott Hughes 	case 't':
374*9a7741deSElliott Hughes 		c = '\t';
375*9a7741deSElliott Hughes 		break;
376*9a7741deSElliott Hughes 	case 'n':
377*9a7741deSElliott Hughes 		c = '\n';
378*9a7741deSElliott Hughes 		break;
379*9a7741deSElliott Hughes 	case 'f':
380*9a7741deSElliott Hughes 		c = '\f';
381*9a7741deSElliott Hughes 		break;
382*9a7741deSElliott Hughes 	case 'r':
383*9a7741deSElliott Hughes 		c = '\r';
384*9a7741deSElliott Hughes 		break;
385*9a7741deSElliott Hughes 	case 'b':
386*9a7741deSElliott Hughes 		c = '\b';
387*9a7741deSElliott Hughes 		break;
388*9a7741deSElliott Hughes 	case 'v':
389*9a7741deSElliott Hughes 		c = '\v';
390*9a7741deSElliott Hughes 		break;
391*9a7741deSElliott Hughes 	case 'a':
392*9a7741deSElliott Hughes 		c = '\a';
393*9a7741deSElliott Hughes 		break;
394*9a7741deSElliott Hughes 	case '\\':
395*9a7741deSElliott Hughes 		c = '\\';
396*9a7741deSElliott Hughes 		break;
397*9a7741deSElliott Hughes 	case 'x': /* 2 hex digits follow */
398*9a7741deSElliott Hughes 		c = hexstr(&p, 2); /* this adds a null if number is invalid */
399*9a7741deSElliott Hughes 		break;
400*9a7741deSElliott Hughes 	case 'u': /* unicode char number up to 8 hex digits */
401*9a7741deSElliott Hughes 		c = hexstr(&p, 8);
402*9a7741deSElliott Hughes 		break;
403*9a7741deSElliott Hughes 	default:
404*9a7741deSElliott Hughes 		if (isoctdigit(c)) { /* \d \dd \ddd */
405*9a7741deSElliott Hughes 			int n = c - '0';
406*9a7741deSElliott Hughes 			if (isoctdigit(*p)) {
407*9a7741deSElliott Hughes 				n = 8 * n + *p++ - '0';
408*9a7741deSElliott Hughes 				if (isoctdigit(*p))
409*9a7741deSElliott Hughes 					n = 8 * n + *p++ - '0';
410*9a7741deSElliott Hughes 			}
411*9a7741deSElliott Hughes 			c = n;
412*9a7741deSElliott Hughes 		}
413*9a7741deSElliott Hughes 	}
414*9a7741deSElliott Hughes 
415*9a7741deSElliott Hughes 	*pp = p;
416*9a7741deSElliott Hughes 	return c;
417*9a7741deSElliott Hughes }
418*9a7741deSElliott Hughes 
cclenter(const char * argp)419*9a7741deSElliott Hughes int *cclenter(const char *argp)	/* add a character class */
420*9a7741deSElliott Hughes {
421*9a7741deSElliott Hughes 	int i, c, c2;
422*9a7741deSElliott Hughes 	int n;
423*9a7741deSElliott Hughes 	const uschar *p = (const uschar *) argp;
424*9a7741deSElliott Hughes 	int *bp, *retp;
425*9a7741deSElliott Hughes 	static int *buf = NULL;
426*9a7741deSElliott Hughes 	static int bufsz = 100;
427*9a7741deSElliott Hughes 
428*9a7741deSElliott Hughes 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
429*9a7741deSElliott Hughes 		FATAL("out of space for character class [%.10s...] 1", p);
430*9a7741deSElliott Hughes 	bp = buf;
431*9a7741deSElliott Hughes 	for (i = 0; *p != 0; ) {
432*9a7741deSElliott Hughes 		n = u8_rune(&c, (const char *) p);
433*9a7741deSElliott Hughes 		p += n;
434*9a7741deSElliott Hughes 		if (c == '\\') {
435*9a7741deSElliott Hughes 			c = quoted(&p);
436*9a7741deSElliott Hughes 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
437*9a7741deSElliott Hughes 			if (*p != 0) {
438*9a7741deSElliott Hughes 				c = bp[-1];
439*9a7741deSElliott Hughes 				/* c2 = *p++; */
440*9a7741deSElliott Hughes 				n = u8_rune(&c2, (const char *) p);
441*9a7741deSElliott Hughes 				p += n;
442*9a7741deSElliott Hughes 				if (c2 == '\\')
443*9a7741deSElliott Hughes 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
444*9a7741deSElliott Hughes 				if (c > c2) {	/* empty; ignore */
445*9a7741deSElliott Hughes 					bp--;
446*9a7741deSElliott Hughes 					i--;
447*9a7741deSElliott Hughes 					continue;
448*9a7741deSElliott Hughes 				}
449*9a7741deSElliott Hughes 				while (c < c2) {
450*9a7741deSElliott Hughes 					if (i >= bufsz) {
451*9a7741deSElliott Hughes 						bufsz *= 2;
452*9a7741deSElliott Hughes 						buf = (int *) realloc(buf, bufsz * sizeof(int));
453*9a7741deSElliott Hughes 						if (buf == NULL)
454*9a7741deSElliott Hughes 							FATAL("out of space for character class [%.10s...] 2", p);
455*9a7741deSElliott Hughes 						bp = buf + i;
456*9a7741deSElliott Hughes 					}
457*9a7741deSElliott Hughes 					*bp++ = ++c;
458*9a7741deSElliott Hughes 					i++;
459*9a7741deSElliott Hughes 				}
460*9a7741deSElliott Hughes 				continue;
461*9a7741deSElliott Hughes 			}
462*9a7741deSElliott Hughes 		}
463*9a7741deSElliott Hughes 		if (i >= bufsz) {
464*9a7741deSElliott Hughes 			bufsz *= 2;
465*9a7741deSElliott Hughes 			buf = (int *) realloc(buf, bufsz * sizeof(int));
466*9a7741deSElliott Hughes 			if (buf == NULL)
467*9a7741deSElliott Hughes 				FATAL("out of space for character class [%.10s...] 2", p);
468*9a7741deSElliott Hughes 			bp = buf + i;
469*9a7741deSElliott Hughes 		}
470*9a7741deSElliott Hughes 		*bp++ = c;
471*9a7741deSElliott Hughes 		i++;
472*9a7741deSElliott Hughes 	}
473*9a7741deSElliott Hughes 	*bp = 0;
474*9a7741deSElliott Hughes 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
475*9a7741deSElliott Hughes 	/* xfree(op);  BUG: what are we freeing here? */
476*9a7741deSElliott Hughes 	retp = (int *) calloc(bp-buf+1, sizeof(int));
477*9a7741deSElliott Hughes 	for (i = 0; i < bp-buf+1; i++)
478*9a7741deSElliott Hughes 		retp[i] = buf[i];
479*9a7741deSElliott Hughes 	return retp;
480*9a7741deSElliott Hughes }
481*9a7741deSElliott Hughes 
overflo(const char * s)482*9a7741deSElliott Hughes void overflo(const char *s)
483*9a7741deSElliott Hughes {
484*9a7741deSElliott Hughes 	FATAL("regular expression too big: out of space in %.30s...", s);
485*9a7741deSElliott Hughes }
486*9a7741deSElliott Hughes 
cfoll(fa * f,Node * v)487*9a7741deSElliott Hughes void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
488*9a7741deSElliott Hughes {
489*9a7741deSElliott Hughes 	int i;
490*9a7741deSElliott Hughes 	int *p;
491*9a7741deSElliott Hughes 
492*9a7741deSElliott Hughes 	switch (type(v)) {
493*9a7741deSElliott Hughes 	ELEAF
494*9a7741deSElliott Hughes 	LEAF
495*9a7741deSElliott Hughes 		f->re[info(v)].ltype = type(v);
496*9a7741deSElliott Hughes 		f->re[info(v)].lval.np = right(v);
497*9a7741deSElliott Hughes 		while (f->accept >= maxsetvec) {	/* guessing here! */
498*9a7741deSElliott Hughes 			resizesetvec(__func__);
499*9a7741deSElliott Hughes 		}
500*9a7741deSElliott Hughes 		for (i = 0; i <= f->accept; i++)
501*9a7741deSElliott Hughes 			setvec[i] = 0;
502*9a7741deSElliott Hughes 		setcnt = 0;
503*9a7741deSElliott Hughes 		follow(v);	/* computes setvec and setcnt */
504*9a7741deSElliott Hughes 		p = intalloc(setcnt + 1, __func__);
505*9a7741deSElliott Hughes 		f->re[info(v)].lfollow = p;
506*9a7741deSElliott Hughes 		*p = setcnt;
507*9a7741deSElliott Hughes 		for (i = f->accept; i >= 0; i--)
508*9a7741deSElliott Hughes 			if (setvec[i] == 1)
509*9a7741deSElliott Hughes 				*++p = i;
510*9a7741deSElliott Hughes 		break;
511*9a7741deSElliott Hughes 	UNARY
512*9a7741deSElliott Hughes 		cfoll(f,left(v));
513*9a7741deSElliott Hughes 		break;
514*9a7741deSElliott Hughes 	case CAT:
515*9a7741deSElliott Hughes 	case OR:
516*9a7741deSElliott Hughes 		cfoll(f,left(v));
517*9a7741deSElliott Hughes 		cfoll(f,right(v));
518*9a7741deSElliott Hughes 		break;
519*9a7741deSElliott Hughes 	case ZERO:
520*9a7741deSElliott Hughes 		break;
521*9a7741deSElliott Hughes 	default:	/* can't happen */
522*9a7741deSElliott Hughes 		FATAL("can't happen: unknown type %d in cfoll", type(v));
523*9a7741deSElliott Hughes 	}
524*9a7741deSElliott Hughes }
525*9a7741deSElliott Hughes 
first(Node * p)526*9a7741deSElliott Hughes int first(Node *p)	/* collects initially active leaves of p into setvec */
527*9a7741deSElliott Hughes 			/* returns 0 if p matches empty string */
528*9a7741deSElliott Hughes {
529*9a7741deSElliott Hughes 	int b, lp;
530*9a7741deSElliott Hughes 
531*9a7741deSElliott Hughes 	switch (type(p)) {
532*9a7741deSElliott Hughes 	ELEAF
533*9a7741deSElliott Hughes 	LEAF
534*9a7741deSElliott Hughes 		lp = info(p);	/* look for high-water mark of subscripts */
535*9a7741deSElliott Hughes 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
536*9a7741deSElliott Hughes 			resizesetvec(__func__);
537*9a7741deSElliott Hughes 		}
538*9a7741deSElliott Hughes 		if (type(p) == EMPTYRE) {
539*9a7741deSElliott Hughes 			setvec[lp] = 0;
540*9a7741deSElliott Hughes 			return(0);
541*9a7741deSElliott Hughes 		}
542*9a7741deSElliott Hughes 		if (setvec[lp] != 1) {
543*9a7741deSElliott Hughes 			setvec[lp] = 1;
544*9a7741deSElliott Hughes 			setcnt++;
545*9a7741deSElliott Hughes 		}
546*9a7741deSElliott Hughes 		if (type(p) == CCL && (*(int *) right(p)) == 0)
547*9a7741deSElliott Hughes 			return(0);		/* empty CCL */
548*9a7741deSElliott Hughes 		return(1);
549*9a7741deSElliott Hughes 	case PLUS:
550*9a7741deSElliott Hughes 		if (first(left(p)) == 0)
551*9a7741deSElliott Hughes 			return(0);
552*9a7741deSElliott Hughes 		return(1);
553*9a7741deSElliott Hughes 	case STAR:
554*9a7741deSElliott Hughes 	case QUEST:
555*9a7741deSElliott Hughes 		first(left(p));
556*9a7741deSElliott Hughes 		return(0);
557*9a7741deSElliott Hughes 	case CAT:
558*9a7741deSElliott Hughes 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
559*9a7741deSElliott Hughes 		return(1);
560*9a7741deSElliott Hughes 	case OR:
561*9a7741deSElliott Hughes 		b = first(right(p));
562*9a7741deSElliott Hughes 		if (first(left(p)) == 0 || b == 0) return(0);
563*9a7741deSElliott Hughes 		return(1);
564*9a7741deSElliott Hughes 	case ZERO:
565*9a7741deSElliott Hughes 		return 0;
566*9a7741deSElliott Hughes 	}
567*9a7741deSElliott Hughes 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
568*9a7741deSElliott Hughes 	return(-1);
569*9a7741deSElliott Hughes }
570*9a7741deSElliott Hughes 
follow(Node * v)571*9a7741deSElliott Hughes void follow(Node *v)	/* collects leaves that can follow v into setvec */
572*9a7741deSElliott Hughes {
573*9a7741deSElliott Hughes 	Node *p;
574*9a7741deSElliott Hughes 
575*9a7741deSElliott Hughes 	if (type(v) == FINAL)
576*9a7741deSElliott Hughes 		return;
577*9a7741deSElliott Hughes 	p = parent(v);
578*9a7741deSElliott Hughes 	switch (type(p)) {
579*9a7741deSElliott Hughes 	case STAR:
580*9a7741deSElliott Hughes 	case PLUS:
581*9a7741deSElliott Hughes 		first(v);
582*9a7741deSElliott Hughes 		follow(p);
583*9a7741deSElliott Hughes 		return;
584*9a7741deSElliott Hughes 
585*9a7741deSElliott Hughes 	case OR:
586*9a7741deSElliott Hughes 	case QUEST:
587*9a7741deSElliott Hughes 		follow(p);
588*9a7741deSElliott Hughes 		return;
589*9a7741deSElliott Hughes 
590*9a7741deSElliott Hughes 	case CAT:
591*9a7741deSElliott Hughes 		if (v == left(p)) {	/* v is left child of p */
592*9a7741deSElliott Hughes 			if (first(right(p)) == 0) {
593*9a7741deSElliott Hughes 				follow(p);
594*9a7741deSElliott Hughes 				return;
595*9a7741deSElliott Hughes 			}
596*9a7741deSElliott Hughes 		} else		/* v is right child */
597*9a7741deSElliott Hughes 			follow(p);
598*9a7741deSElliott Hughes 		return;
599*9a7741deSElliott Hughes 	}
600*9a7741deSElliott Hughes }
601*9a7741deSElliott Hughes 
member(int c,int * sarg)602*9a7741deSElliott Hughes int member(int c, int *sarg)	/* is c in s? */
603*9a7741deSElliott Hughes {
604*9a7741deSElliott Hughes 	int *s = (int *) sarg;
605*9a7741deSElliott Hughes 
606*9a7741deSElliott Hughes 	while (*s)
607*9a7741deSElliott Hughes 		if (c == *s++)
608*9a7741deSElliott Hughes 			return(1);
609*9a7741deSElliott Hughes 	return(0);
610*9a7741deSElliott Hughes }
611*9a7741deSElliott Hughes 
resize_gototab(fa * f,int state)612*9a7741deSElliott Hughes static void resize_gototab(fa *f, int state)
613*9a7741deSElliott Hughes {
614*9a7741deSElliott Hughes 	size_t new_size = f->gototab[state].allocated * 2;
615*9a7741deSElliott Hughes 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
616*9a7741deSElliott Hughes 	if (p == NULL)
617*9a7741deSElliott Hughes 		overflo(__func__);
618*9a7741deSElliott Hughes 
619*9a7741deSElliott Hughes 	// need to initialized the new memory to zero
620*9a7741deSElliott Hughes 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
621*9a7741deSElliott Hughes 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
622*9a7741deSElliott Hughes 
623*9a7741deSElliott Hughes 	f->gototab[state].allocated = new_size;			// update gototab info
624*9a7741deSElliott Hughes 	f->gototab[state].entries = p;
625*9a7741deSElliott Hughes }
626*9a7741deSElliott Hughes 
get_gototab(fa * f,int state,int ch)627*9a7741deSElliott Hughes static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
628*9a7741deSElliott Hughes {
629*9a7741deSElliott Hughes 	gtte key;
630*9a7741deSElliott Hughes 	gtte *item;
631*9a7741deSElliott Hughes 
632*9a7741deSElliott Hughes 	key.ch = ch;
633*9a7741deSElliott Hughes 	key.state = 0;	/* irrelevant */
634*9a7741deSElliott Hughes 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
635*9a7741deSElliott Hughes 			f->gototab[state].inuse, sizeof(gtte),
636*9a7741deSElliott Hughes 			entry_cmp);
637*9a7741deSElliott Hughes 
638*9a7741deSElliott Hughes 	if (item == NULL)
639*9a7741deSElliott Hughes 		return 0;
640*9a7741deSElliott Hughes 	else
641*9a7741deSElliott Hughes 		return item->state;
642*9a7741deSElliott Hughes }
643*9a7741deSElliott Hughes 
entry_cmp(const void * l,const void * r)644*9a7741deSElliott Hughes static int entry_cmp(const void *l, const void *r)
645*9a7741deSElliott Hughes {
646*9a7741deSElliott Hughes 	const gtte *left, *right;
647*9a7741deSElliott Hughes 
648*9a7741deSElliott Hughes 	left = (const gtte *) l;
649*9a7741deSElliott Hughes 	right = (const gtte *) r;
650*9a7741deSElliott Hughes 
651*9a7741deSElliott Hughes 	return left->ch - right->ch;
652*9a7741deSElliott Hughes }
653*9a7741deSElliott Hughes 
set_gototab(fa * f,int state,int ch,int val)654*9a7741deSElliott Hughes static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
655*9a7741deSElliott Hughes {
656*9a7741deSElliott Hughes 	if (f->gototab[state].inuse == 0) {
657*9a7741deSElliott Hughes 		f->gototab[state].entries[0].ch = ch;
658*9a7741deSElliott Hughes 		f->gototab[state].entries[0].state = val;
659*9a7741deSElliott Hughes 		f->gototab[state].inuse++;
660*9a7741deSElliott Hughes 		return val;
661*9a7741deSElliott Hughes 	} else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
662*9a7741deSElliott Hughes 		// not seen yet, insert and return
663*9a7741deSElliott Hughes 		gtt *tab = & f->gototab[state];
664*9a7741deSElliott Hughes 		if (tab->inuse + 1 >= tab->allocated)
665*9a7741deSElliott Hughes 			resize_gototab(f, state);
666*9a7741deSElliott Hughes 
667*9a7741deSElliott Hughes 		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
668*9a7741deSElliott Hughes 		f->gototab[state].entries[f->gototab[state].inuse].state = val;
669*9a7741deSElliott Hughes 		f->gototab[state].inuse++;
670*9a7741deSElliott Hughes 		return val;
671*9a7741deSElliott Hughes 	} else {
672*9a7741deSElliott Hughes 		// maybe we have it, maybe we don't
673*9a7741deSElliott Hughes 		gtte key;
674*9a7741deSElliott Hughes 		gtte *item;
675*9a7741deSElliott Hughes 
676*9a7741deSElliott Hughes 		key.ch = ch;
677*9a7741deSElliott Hughes 		key.state = 0;	/* irrelevant */
678*9a7741deSElliott Hughes 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
679*9a7741deSElliott Hughes 				f->gototab[state].inuse, sizeof(gtte),
680*9a7741deSElliott Hughes 				entry_cmp);
681*9a7741deSElliott Hughes 
682*9a7741deSElliott Hughes 		if (item != NULL) {
683*9a7741deSElliott Hughes 			// we have it, update state and return
684*9a7741deSElliott Hughes 			item->state = val;
685*9a7741deSElliott Hughes 			return item->state;
686*9a7741deSElliott Hughes 		}
687*9a7741deSElliott Hughes 		// otherwise, fall through to insert and reallocate.
688*9a7741deSElliott Hughes 	}
689*9a7741deSElliott Hughes 
690*9a7741deSElliott Hughes 	gtt *tab = & f->gototab[state];
691*9a7741deSElliott Hughes 	if (tab->inuse + 1 >= tab->allocated)
692*9a7741deSElliott Hughes 		resize_gototab(f, state);
693*9a7741deSElliott Hughes 	f->gototab[state].entries[tab->inuse].ch = ch;
694*9a7741deSElliott Hughes 	f->gototab[state].entries[tab->inuse].state = val;
695*9a7741deSElliott Hughes 	++tab->inuse;
696*9a7741deSElliott Hughes 
697*9a7741deSElliott Hughes 	qsort(f->gototab[state].entries,
698*9a7741deSElliott Hughes 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
699*9a7741deSElliott Hughes 
700*9a7741deSElliott Hughes 	return val; /* not used anywhere at the moment */
701*9a7741deSElliott Hughes }
702*9a7741deSElliott Hughes 
clear_gototab(fa * f,int state)703*9a7741deSElliott Hughes static void clear_gototab(fa *f, int state)
704*9a7741deSElliott Hughes {
705*9a7741deSElliott Hughes 	memset(f->gototab[state].entries, 0,
706*9a7741deSElliott Hughes 		f->gototab[state].allocated * sizeof(gtte));
707*9a7741deSElliott Hughes 	f->gototab[state].inuse = 0;
708*9a7741deSElliott Hughes }
709*9a7741deSElliott Hughes 
match(fa * f,const char * p0)710*9a7741deSElliott Hughes int match(fa *f, const char *p0)	/* shortest match ? */
711*9a7741deSElliott Hughes {
712*9a7741deSElliott Hughes 	int s, ns;
713*9a7741deSElliott Hughes 	int n;
714*9a7741deSElliott Hughes 	int rune;
715*9a7741deSElliott Hughes 	const uschar *p = (const uschar *) p0;
716*9a7741deSElliott Hughes 
717*9a7741deSElliott Hughes 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
718*9a7741deSElliott Hughes 
719*9a7741deSElliott Hughes 	s = f->initstat;
720*9a7741deSElliott Hughes 	assert (s < f->state_count);
721*9a7741deSElliott Hughes 
722*9a7741deSElliott Hughes 	if (f->out[s])
723*9a7741deSElliott Hughes 		return(1);
724*9a7741deSElliott Hughes 	do {
725*9a7741deSElliott Hughes 		/* assert(*p < NCHARS); */
726*9a7741deSElliott Hughes 		n = u8_rune(&rune, (const char *) p);
727*9a7741deSElliott Hughes 		if ((ns = get_gototab(f, s, rune)) != 0)
728*9a7741deSElliott Hughes 			s = ns;
729*9a7741deSElliott Hughes 		else
730*9a7741deSElliott Hughes 			s = cgoto(f, s, rune);
731*9a7741deSElliott Hughes 		if (f->out[s])
732*9a7741deSElliott Hughes 			return(1);
733*9a7741deSElliott Hughes 		if (*p == 0)
734*9a7741deSElliott Hughes 			break;
735*9a7741deSElliott Hughes 		p += n;
736*9a7741deSElliott Hughes 	} while (1);  /* was *p++ != 0 */
737*9a7741deSElliott Hughes 	return(0);
738*9a7741deSElliott Hughes }
739*9a7741deSElliott Hughes 
pmatch(fa * f,const char * p0)740*9a7741deSElliott Hughes int pmatch(fa *f, const char *p0)	/* longest match, for sub */
741*9a7741deSElliott Hughes {
742*9a7741deSElliott Hughes 	int s, ns;
743*9a7741deSElliott Hughes 	int n;
744*9a7741deSElliott Hughes 	int rune;
745*9a7741deSElliott Hughes 	const uschar *p = (const uschar *) p0;
746*9a7741deSElliott Hughes 	const uschar *q;
747*9a7741deSElliott Hughes 
748*9a7741deSElliott Hughes 	s = f->initstat;
749*9a7741deSElliott Hughes 	assert(s < f->state_count);
750*9a7741deSElliott Hughes 
751*9a7741deSElliott Hughes 	patbeg = (const char *)p;
752*9a7741deSElliott Hughes 	patlen = -1;
753*9a7741deSElliott Hughes 	do {
754*9a7741deSElliott Hughes 		q = p;
755*9a7741deSElliott Hughes 		do {
756*9a7741deSElliott Hughes 			if (f->out[s])		/* final state */
757*9a7741deSElliott Hughes 				patlen = q-p;
758*9a7741deSElliott Hughes 			/* assert(*q < NCHARS); */
759*9a7741deSElliott Hughes 			n = u8_rune(&rune, (const char *) q);
760*9a7741deSElliott Hughes 			if ((ns = get_gototab(f, s, rune)) != 0)
761*9a7741deSElliott Hughes 				s = ns;
762*9a7741deSElliott Hughes 			else
763*9a7741deSElliott Hughes 				s = cgoto(f, s, rune);
764*9a7741deSElliott Hughes 
765*9a7741deSElliott Hughes 			assert(s < f->state_count);
766*9a7741deSElliott Hughes 
767*9a7741deSElliott Hughes 			if (s == 1) {	/* no transition */
768*9a7741deSElliott Hughes 				if (patlen >= 0) {
769*9a7741deSElliott Hughes 					patbeg = (const char *) p;
770*9a7741deSElliott Hughes 					return(1);
771*9a7741deSElliott Hughes 				}
772*9a7741deSElliott Hughes 				else
773*9a7741deSElliott Hughes 					goto nextin;	/* no match */
774*9a7741deSElliott Hughes 			}
775*9a7741deSElliott Hughes 			if (*q == 0)
776*9a7741deSElliott Hughes 				break;
777*9a7741deSElliott Hughes 			q += n;
778*9a7741deSElliott Hughes 		} while (1);
779*9a7741deSElliott Hughes 		q++;  /* was *q++ */
780*9a7741deSElliott Hughes 		if (f->out[s])
781*9a7741deSElliott Hughes 			patlen = q-p-1;	/* don't count $ */
782*9a7741deSElliott Hughes 		if (patlen >= 0) {
783*9a7741deSElliott Hughes 			patbeg = (const char *) p;
784*9a7741deSElliott Hughes 			return(1);
785*9a7741deSElliott Hughes 		}
786*9a7741deSElliott Hughes 	nextin:
787*9a7741deSElliott Hughes 		s = 2;
788*9a7741deSElliott Hughes 		if (*p == 0)
789*9a7741deSElliott Hughes 			break;
790*9a7741deSElliott Hughes 		n = u8_rune(&rune, (const char *) p);
791*9a7741deSElliott Hughes 		p += n;
792*9a7741deSElliott Hughes 	} while (1); /* was *p++ */
793*9a7741deSElliott Hughes 	return (0);
794*9a7741deSElliott Hughes }
795*9a7741deSElliott Hughes 
nematch(fa * f,const char * p0)796*9a7741deSElliott Hughes int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
797*9a7741deSElliott Hughes {
798*9a7741deSElliott Hughes 	int s, ns;
799*9a7741deSElliott Hughes         int n;
800*9a7741deSElliott Hughes         int rune;
801*9a7741deSElliott Hughes 	const uschar *p = (const uschar *) p0;
802*9a7741deSElliott Hughes 	const uschar *q;
803*9a7741deSElliott Hughes 
804*9a7741deSElliott Hughes 	s = f->initstat;
805*9a7741deSElliott Hughes 	assert(s < f->state_count);
806*9a7741deSElliott Hughes 
807*9a7741deSElliott Hughes 	patbeg = (const char *)p;
808*9a7741deSElliott Hughes 	patlen = -1;
809*9a7741deSElliott Hughes 	while (*p) {
810*9a7741deSElliott Hughes 		q = p;
811*9a7741deSElliott Hughes 		do {
812*9a7741deSElliott Hughes 			if (f->out[s])		/* final state */
813*9a7741deSElliott Hughes 				patlen = q-p;
814*9a7741deSElliott Hughes 			/* assert(*q < NCHARS); */
815*9a7741deSElliott Hughes 			n = u8_rune(&rune, (const char *) q);
816*9a7741deSElliott Hughes 			if ((ns = get_gototab(f, s, rune)) != 0)
817*9a7741deSElliott Hughes 				s = ns;
818*9a7741deSElliott Hughes 			else
819*9a7741deSElliott Hughes 				s = cgoto(f, s, rune);
820*9a7741deSElliott Hughes 			if (s == 1) {	/* no transition */
821*9a7741deSElliott Hughes 				if (patlen > 0) {
822*9a7741deSElliott Hughes 					patbeg = (const char *) p;
823*9a7741deSElliott Hughes 					return(1);
824*9a7741deSElliott Hughes 				} else
825*9a7741deSElliott Hughes 					goto nnextin;	/* no nonempty match */
826*9a7741deSElliott Hughes 			}
827*9a7741deSElliott Hughes 			if (*q == 0)
828*9a7741deSElliott Hughes 				break;
829*9a7741deSElliott Hughes 			q += n;
830*9a7741deSElliott Hughes 		} while (1);
831*9a7741deSElliott Hughes 		q++;
832*9a7741deSElliott Hughes 		if (f->out[s])
833*9a7741deSElliott Hughes 			patlen = q-p-1;	/* don't count $ */
834*9a7741deSElliott Hughes 		if (patlen > 0 ) {
835*9a7741deSElliott Hughes 			patbeg = (const char *) p;
836*9a7741deSElliott Hughes 			return(1);
837*9a7741deSElliott Hughes 		}
838*9a7741deSElliott Hughes 	nnextin:
839*9a7741deSElliott Hughes 		s = 2;
840*9a7741deSElliott Hughes 		p++;
841*9a7741deSElliott Hughes 	}
842*9a7741deSElliott Hughes 	return (0);
843*9a7741deSElliott Hughes }
844*9a7741deSElliott Hughes 
845*9a7741deSElliott Hughes 
846*9a7741deSElliott Hughes /*
847*9a7741deSElliott Hughes  * NAME
848*9a7741deSElliott Hughes  *     fnematch
849*9a7741deSElliott Hughes  *
850*9a7741deSElliott Hughes  * DESCRIPTION
851*9a7741deSElliott Hughes  *     A stream-fed version of nematch which transfers characters to a
852*9a7741deSElliott Hughes  *     null-terminated buffer. All characters up to and including the last
853*9a7741deSElliott Hughes  *     character of the matching text or EOF are placed in the buffer. If
854*9a7741deSElliott Hughes  *     a match is found, patbeg and patlen are set appropriately.
855*9a7741deSElliott Hughes  *
856*9a7741deSElliott Hughes  * RETURN VALUES
857*9a7741deSElliott Hughes  *     false    No match found.
858*9a7741deSElliott Hughes  *     true     Match found.
859*9a7741deSElliott Hughes  */
860*9a7741deSElliott Hughes 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)861*9a7741deSElliott Hughes bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
862*9a7741deSElliott Hughes {
863*9a7741deSElliott Hughes 	char *i, *j, *k, *buf = *pbuf;
864*9a7741deSElliott Hughes 	int bufsize = *pbufsize;
865*9a7741deSElliott Hughes 	int c, n, ns, s;
866*9a7741deSElliott Hughes 
867*9a7741deSElliott Hughes 	s = pfa->initstat;
868*9a7741deSElliott Hughes 	patlen = 0;
869*9a7741deSElliott Hughes 
870*9a7741deSElliott Hughes 	/*
871*9a7741deSElliott Hughes 	 * buf <= i <= j <= k <= buf+bufsize
872*9a7741deSElliott Hughes 	 *
873*9a7741deSElliott Hughes 	 * i: origin of active substring
874*9a7741deSElliott Hughes 	 * j: current character
875*9a7741deSElliott Hughes 	 * k: destination of the next getc
876*9a7741deSElliott Hughes 	 */
877*9a7741deSElliott Hughes 
878*9a7741deSElliott Hughes 	i = j = k = buf;
879*9a7741deSElliott Hughes 
880*9a7741deSElliott Hughes 	do {
881*9a7741deSElliott Hughes 		/*
882*9a7741deSElliott Hughes 		 * Call u8_rune with at least awk_mb_cur_max ahead in
883*9a7741deSElliott Hughes 		 * the buffer until EOF interferes.
884*9a7741deSElliott Hughes 		 */
885*9a7741deSElliott Hughes 		if (k - j < (int)awk_mb_cur_max) {
886*9a7741deSElliott Hughes 			if (k + awk_mb_cur_max > buf + bufsize) {
887*9a7741deSElliott Hughes 				char *obuf = buf;
888*9a7741deSElliott Hughes 				adjbuf((char **) &buf, &bufsize,
889*9a7741deSElliott Hughes 				    bufsize + awk_mb_cur_max,
890*9a7741deSElliott Hughes 				    quantum, 0, "fnematch");
891*9a7741deSElliott Hughes 
892*9a7741deSElliott Hughes 				/* buf resized, maybe moved. update pointers */
893*9a7741deSElliott Hughes 				*pbufsize = bufsize;
894*9a7741deSElliott Hughes 				if (obuf != buf) {
895*9a7741deSElliott Hughes 					i = buf + (i - obuf);
896*9a7741deSElliott Hughes 					j = buf + (j - obuf);
897*9a7741deSElliott Hughes 					k = buf + (k - obuf);
898*9a7741deSElliott Hughes 					*pbuf = buf;
899*9a7741deSElliott Hughes 					if (patlen)
900*9a7741deSElliott Hughes 						patbeg = buf + (patbeg - obuf);
901*9a7741deSElliott Hughes 				}
902*9a7741deSElliott Hughes 			}
903*9a7741deSElliott Hughes 			for (n = awk_mb_cur_max ; n > 0; n--) {
904*9a7741deSElliott Hughes 				*k++ = (c = getc(f)) != EOF ? c : 0;
905*9a7741deSElliott Hughes 				if (c == EOF) {
906*9a7741deSElliott Hughes 					if (ferror(f))
907*9a7741deSElliott Hughes 						FATAL("fnematch: getc error");
908*9a7741deSElliott Hughes 					break;
909*9a7741deSElliott Hughes 				}
910*9a7741deSElliott Hughes 			}
911*9a7741deSElliott Hughes 		}
912*9a7741deSElliott Hughes 
913*9a7741deSElliott Hughes 		j += u8_rune(&c, j);
914*9a7741deSElliott Hughes 
915*9a7741deSElliott Hughes 		if ((ns = get_gototab(pfa, s, c)) != 0)
916*9a7741deSElliott Hughes 			s = ns;
917*9a7741deSElliott Hughes 		else
918*9a7741deSElliott Hughes 			s = cgoto(pfa, s, c);
919*9a7741deSElliott Hughes 
920*9a7741deSElliott Hughes 		if (pfa->out[s]) {	/* final state */
921*9a7741deSElliott Hughes 			patbeg = i;
922*9a7741deSElliott Hughes 			patlen = j - i;
923*9a7741deSElliott Hughes 			if (c == 0)	/* don't count $ */
924*9a7741deSElliott Hughes 				patlen--;
925*9a7741deSElliott Hughes 		}
926*9a7741deSElliott Hughes 
927*9a7741deSElliott Hughes 		if (c && s != 1)
928*9a7741deSElliott Hughes 			continue;  /* origin i still viable, next j */
929*9a7741deSElliott Hughes 		if (patlen)
930*9a7741deSElliott Hughes 			break;     /* best match found */
931*9a7741deSElliott Hughes 
932*9a7741deSElliott Hughes 		/* no match at origin i, next i and start over */
933*9a7741deSElliott Hughes 		i += u8_rune(&c, i);
934*9a7741deSElliott Hughes 		if (c == 0)
935*9a7741deSElliott Hughes 			break;    /* no match */
936*9a7741deSElliott Hughes 		j = i;
937*9a7741deSElliott Hughes 		s = 2;
938*9a7741deSElliott Hughes 	} while (1);
939*9a7741deSElliott Hughes 
940*9a7741deSElliott Hughes 	if (patlen) {
941*9a7741deSElliott Hughes 		/*
942*9a7741deSElliott Hughes 		 * Under no circumstances is the last character fed to
943*9a7741deSElliott Hughes 		 * the automaton part of the match. It is EOF's nullbyte,
944*9a7741deSElliott Hughes 		 * or it sent the automaton into a state with no further
945*9a7741deSElliott Hughes 		 * transitions available (s==1), or both. Room for a
946*9a7741deSElliott Hughes 		 * terminating nullbyte is guaranteed.
947*9a7741deSElliott Hughes 		 *
948*9a7741deSElliott Hughes 		 * ungetc any chars after the end of matching text
949*9a7741deSElliott Hughes 		 * (except for EOF's nullbyte, if present) and null
950*9a7741deSElliott Hughes 		 * terminate the buffer.
951*9a7741deSElliott Hughes 		 */
952*9a7741deSElliott Hughes 		do
953*9a7741deSElliott Hughes 			if (*--k && ungetc(*k, f) == EOF)
954*9a7741deSElliott Hughes 				FATAL("unable to ungetc '%c'", *k);
955*9a7741deSElliott Hughes 		while (k > patbeg + patlen);
956*9a7741deSElliott Hughes 		*k = '\0';
957*9a7741deSElliott Hughes 		return true;
958*9a7741deSElliott Hughes 	}
959*9a7741deSElliott Hughes 	else
960*9a7741deSElliott Hughes 		return false;
961*9a7741deSElliott Hughes }
962*9a7741deSElliott Hughes 
reparse(const char * p)963*9a7741deSElliott Hughes Node *reparse(const char *p)	/* parses regular expression pointed to by p */
964*9a7741deSElliott Hughes {			/* uses relex() to scan regular expression */
965*9a7741deSElliott Hughes 	Node *np;
966*9a7741deSElliott Hughes 
967*9a7741deSElliott Hughes 	DPRINTF("reparse <%s>\n", p);
968*9a7741deSElliott Hughes 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
969*9a7741deSElliott Hughes 	rtok = relex();
970*9a7741deSElliott Hughes 	/* GNU compatibility: an empty regexp matches anything */
971*9a7741deSElliott Hughes 	if (rtok == '\0') {
972*9a7741deSElliott Hughes 		/* FATAL("empty regular expression"); previous */
973*9a7741deSElliott Hughes 		return(op2(EMPTYRE, NIL, NIL));
974*9a7741deSElliott Hughes 	}
975*9a7741deSElliott Hughes 	np = regexp();
976*9a7741deSElliott Hughes 	if (rtok != '\0')
977*9a7741deSElliott Hughes 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
978*9a7741deSElliott Hughes 	return(np);
979*9a7741deSElliott Hughes }
980*9a7741deSElliott Hughes 
regexp(void)981*9a7741deSElliott Hughes Node *regexp(void)	/* top-level parse of reg expr */
982*9a7741deSElliott Hughes {
983*9a7741deSElliott Hughes 	return (alt(concat(primary())));
984*9a7741deSElliott Hughes }
985*9a7741deSElliott Hughes 
primary(void)986*9a7741deSElliott Hughes Node *primary(void)
987*9a7741deSElliott Hughes {
988*9a7741deSElliott Hughes 	Node *np;
989*9a7741deSElliott Hughes 	int savelastatom;
990*9a7741deSElliott Hughes 
991*9a7741deSElliott Hughes 	switch (rtok) {
992*9a7741deSElliott Hughes 	case CHAR:
993*9a7741deSElliott Hughes 		lastatom = starttok;
994*9a7741deSElliott Hughes 		np = op2(CHAR, NIL, itonp(rlxval));
995*9a7741deSElliott Hughes 		rtok = relex();
996*9a7741deSElliott Hughes 		return (unary(np));
997*9a7741deSElliott Hughes 	case ALL:
998*9a7741deSElliott Hughes 		rtok = relex();
999*9a7741deSElliott Hughes 		return (unary(op2(ALL, NIL, NIL)));
1000*9a7741deSElliott Hughes 	case EMPTYRE:
1001*9a7741deSElliott Hughes 		rtok = relex();
1002*9a7741deSElliott Hughes 		return (unary(op2(EMPTYRE, NIL, NIL)));
1003*9a7741deSElliott Hughes 	case DOT:
1004*9a7741deSElliott Hughes 		lastatom = starttok;
1005*9a7741deSElliott Hughes 		rtok = relex();
1006*9a7741deSElliott Hughes 		return (unary(op2(DOT, NIL, NIL)));
1007*9a7741deSElliott Hughes 	case CCL:
1008*9a7741deSElliott Hughes 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1009*9a7741deSElliott Hughes 		lastatom = starttok;
1010*9a7741deSElliott Hughes 		rtok = relex();
1011*9a7741deSElliott Hughes 		return (unary(np));
1012*9a7741deSElliott Hughes 	case NCCL:
1013*9a7741deSElliott Hughes 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1014*9a7741deSElliott Hughes 		lastatom = starttok;
1015*9a7741deSElliott Hughes 		rtok = relex();
1016*9a7741deSElliott Hughes 		return (unary(np));
1017*9a7741deSElliott Hughes 	case '^':
1018*9a7741deSElliott Hughes 		rtok = relex();
1019*9a7741deSElliott Hughes 		return (unary(op2(CHAR, NIL, itonp(HAT))));
1020*9a7741deSElliott Hughes 	case '$':
1021*9a7741deSElliott Hughes 		rtok = relex();
1022*9a7741deSElliott Hughes 		return (unary(op2(CHAR, NIL, NIL)));
1023*9a7741deSElliott Hughes 	case '(':
1024*9a7741deSElliott Hughes 		lastatom = starttok;
1025*9a7741deSElliott Hughes 		savelastatom = starttok - basestr; /* Retain over recursion */
1026*9a7741deSElliott Hughes 		rtok = relex();
1027*9a7741deSElliott Hughes 		if (rtok == ')') {	/* special pleading for () */
1028*9a7741deSElliott Hughes 			rtok = relex();
1029*9a7741deSElliott Hughes 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1030*9a7741deSElliott Hughes 		}
1031*9a7741deSElliott Hughes 		np = regexp();
1032*9a7741deSElliott Hughes 		if (rtok == ')') {
1033*9a7741deSElliott Hughes 			lastatom = basestr + savelastatom; /* Restore */
1034*9a7741deSElliott Hughes 			rtok = relex();
1035*9a7741deSElliott Hughes 			return (unary(np));
1036*9a7741deSElliott Hughes 		}
1037*9a7741deSElliott Hughes 		else
1038*9a7741deSElliott Hughes 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1039*9a7741deSElliott Hughes 	default:
1040*9a7741deSElliott Hughes 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1041*9a7741deSElliott Hughes 	}
1042*9a7741deSElliott Hughes 	return 0;	/*NOTREACHED*/
1043*9a7741deSElliott Hughes }
1044*9a7741deSElliott Hughes 
concat(Node * np)1045*9a7741deSElliott Hughes Node *concat(Node *np)
1046*9a7741deSElliott Hughes {
1047*9a7741deSElliott Hughes 	switch (rtok) {
1048*9a7741deSElliott Hughes 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1049*9a7741deSElliott Hughes 		return (concat(op2(CAT, np, primary())));
1050*9a7741deSElliott Hughes 	case EMPTYRE:
1051*9a7741deSElliott Hughes 		rtok = relex();
1052*9a7741deSElliott Hughes 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1053*9a7741deSElliott Hughes 				primary())));
1054*9a7741deSElliott Hughes 	}
1055*9a7741deSElliott Hughes 	return (np);
1056*9a7741deSElliott Hughes }
1057*9a7741deSElliott Hughes 
alt(Node * np)1058*9a7741deSElliott Hughes Node *alt(Node *np)
1059*9a7741deSElliott Hughes {
1060*9a7741deSElliott Hughes 	if (rtok == OR) {
1061*9a7741deSElliott Hughes 		rtok = relex();
1062*9a7741deSElliott Hughes 		return (alt(op2(OR, np, concat(primary()))));
1063*9a7741deSElliott Hughes 	}
1064*9a7741deSElliott Hughes 	return (np);
1065*9a7741deSElliott Hughes }
1066*9a7741deSElliott Hughes 
unary(Node * np)1067*9a7741deSElliott Hughes Node *unary(Node *np)
1068*9a7741deSElliott Hughes {
1069*9a7741deSElliott Hughes 	switch (rtok) {
1070*9a7741deSElliott Hughes 	case STAR:
1071*9a7741deSElliott Hughes 		rtok = relex();
1072*9a7741deSElliott Hughes 		return (unary(op2(STAR, np, NIL)));
1073*9a7741deSElliott Hughes 	case PLUS:
1074*9a7741deSElliott Hughes 		rtok = relex();
1075*9a7741deSElliott Hughes 		return (unary(op2(PLUS, np, NIL)));
1076*9a7741deSElliott Hughes 	case QUEST:
1077*9a7741deSElliott Hughes 		rtok = relex();
1078*9a7741deSElliott Hughes 		return (unary(op2(QUEST, np, NIL)));
1079*9a7741deSElliott Hughes 	case ZERO:
1080*9a7741deSElliott Hughes 		rtok = relex();
1081*9a7741deSElliott Hughes 		return (unary(op2(ZERO, np, NIL)));
1082*9a7741deSElliott Hughes 	default:
1083*9a7741deSElliott Hughes 		return (np);
1084*9a7741deSElliott Hughes 	}
1085*9a7741deSElliott Hughes }
1086*9a7741deSElliott Hughes 
1087*9a7741deSElliott Hughes /*
1088*9a7741deSElliott Hughes  * Character class definitions conformant to the POSIX locale as
1089*9a7741deSElliott Hughes  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1090*9a7741deSElliott Hughes  * and operating character sets are both ASCII (ISO646) or supersets
1091*9a7741deSElliott Hughes  * thereof.
1092*9a7741deSElliott Hughes  *
1093*9a7741deSElliott Hughes  * Note that to avoid overflowing the temporary buffer used in
1094*9a7741deSElliott Hughes  * relex(), the expanded character class (prior to range expansion)
1095*9a7741deSElliott Hughes  * must be less than twice the size of their full name.
1096*9a7741deSElliott Hughes  */
1097*9a7741deSElliott Hughes 
1098*9a7741deSElliott Hughes /* Because isblank doesn't show up in any of the header files on any
1099*9a7741deSElliott Hughes  * system i use, it's defined here.  if some other locale has a richer
1100*9a7741deSElliott Hughes  * definition of "blank", define HAS_ISBLANK and provide your own
1101*9a7741deSElliott Hughes  * version.
1102*9a7741deSElliott Hughes  * the parentheses here are an attempt to find a path through the maze
1103*9a7741deSElliott Hughes  * of macro definition and/or function and/or version provided.  thanks
1104*9a7741deSElliott Hughes  * to nelson beebe for the suggestion; let's see if it works everywhere.
1105*9a7741deSElliott Hughes  */
1106*9a7741deSElliott Hughes 
1107*9a7741deSElliott Hughes /* #define HAS_ISBLANK */
1108*9a7741deSElliott Hughes #ifndef HAS_ISBLANK
1109*9a7741deSElliott Hughes 
1110*9a7741deSElliott Hughes int (xisblank)(int c)
1111*9a7741deSElliott Hughes {
1112*9a7741deSElliott Hughes 	return c==' ' || c=='\t';
1113*9a7741deSElliott Hughes }
1114*9a7741deSElliott Hughes 
1115*9a7741deSElliott Hughes #endif
1116*9a7741deSElliott Hughes 
1117*9a7741deSElliott Hughes static const struct charclass {
1118*9a7741deSElliott Hughes 	const char *cc_name;
1119*9a7741deSElliott Hughes 	int cc_namelen;
1120*9a7741deSElliott Hughes 	int (*cc_func)(int);
1121*9a7741deSElliott Hughes } charclasses[] = {
1122*9a7741deSElliott Hughes 	{ "alnum",	5,	isalnum },
1123*9a7741deSElliott Hughes 	{ "alpha",	5,	isalpha },
1124*9a7741deSElliott Hughes #ifndef HAS_ISBLANK
1125*9a7741deSElliott Hughes 	{ "blank",	5,	xisblank },
1126*9a7741deSElliott Hughes #else
1127*9a7741deSElliott Hughes 	{ "blank",	5,	isblank },
1128*9a7741deSElliott Hughes #endif
1129*9a7741deSElliott Hughes 	{ "cntrl",	5,	iscntrl },
1130*9a7741deSElliott Hughes 	{ "digit",	5,	isdigit },
1131*9a7741deSElliott Hughes 	{ "graph",	5,	isgraph },
1132*9a7741deSElliott Hughes 	{ "lower",	5,	islower },
1133*9a7741deSElliott Hughes 	{ "print",	5,	isprint },
1134*9a7741deSElliott Hughes 	{ "punct",	5,	ispunct },
1135*9a7741deSElliott Hughes 	{ "space",	5,	isspace },
1136*9a7741deSElliott Hughes 	{ "upper",	5,	isupper },
1137*9a7741deSElliott Hughes 	{ "xdigit",	6,	isxdigit },
1138*9a7741deSElliott Hughes 	{ NULL,		0,	NULL },
1139*9a7741deSElliott Hughes };
1140*9a7741deSElliott Hughes 
1141*9a7741deSElliott Hughes #define REPEAT_SIMPLE		0
1142*9a7741deSElliott Hughes #define REPEAT_PLUS_APPENDED	1
1143*9a7741deSElliott Hughes #define REPEAT_WITH_Q		2
1144*9a7741deSElliott Hughes #define REPEAT_ZERO		3
1145*9a7741deSElliott Hughes 
1146*9a7741deSElliott Hughes static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1147*9a7741deSElliott Hughes replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1148*9a7741deSElliott Hughes 	       int atomlen, int firstnum, int secondnum, int special_case)
1149*9a7741deSElliott Hughes {
1150*9a7741deSElliott Hughes 	int i, j;
1151*9a7741deSElliott Hughes 	uschar *buf = 0;
1152*9a7741deSElliott Hughes 	int ret = 1;
1153*9a7741deSElliott Hughes 	int init_q = (firstnum == 0);		/* first added char will be ? */
1154*9a7741deSElliott Hughes 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1155*9a7741deSElliott Hughes 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1156*9a7741deSElliott Hughes 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1157*9a7741deSElliott Hughes 	int size = prefix_length +  suffix_length;
1158*9a7741deSElliott Hughes 
1159*9a7741deSElliott Hughes 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1160*9a7741deSElliott Hughes 		size += atomlen*(firstnum-1);
1161*9a7741deSElliott Hughes 	}
1162*9a7741deSElliott Hughes 
1163*9a7741deSElliott Hughes 	/* Adjust size of buffer for special cases */
1164*9a7741deSElliott Hughes 	if (special_case == REPEAT_PLUS_APPENDED) {
1165*9a7741deSElliott Hughes 		size++;		/* for the final + */
1166*9a7741deSElliott Hughes 	} else if (special_case == REPEAT_WITH_Q) {
1167*9a7741deSElliott Hughes 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1168*9a7741deSElliott Hughes 	} else if (special_case == REPEAT_ZERO) {
1169*9a7741deSElliott Hughes 		size += 2;	/* just a null ERE: () */
1170*9a7741deSElliott Hughes 	}
1171*9a7741deSElliott Hughes 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1172*9a7741deSElliott Hughes 		FATAL("out of space in reg expr %.10s..", lastre);
1173*9a7741deSElliott Hughes 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1174*9a7741deSElliott Hughes 	j = prefix_length;
1175*9a7741deSElliott Hughes 	if (special_case == REPEAT_ZERO) {
1176*9a7741deSElliott Hughes 		j -= atomlen;
1177*9a7741deSElliott Hughes 		buf[j++] = '(';
1178*9a7741deSElliott Hughes 		buf[j++] = ')';
1179*9a7741deSElliott Hughes 	}
1180*9a7741deSElliott Hughes 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1181*9a7741deSElliott Hughes 		memcpy(&buf[j], atom, atomlen);
1182*9a7741deSElliott Hughes 		j += atomlen;
1183*9a7741deSElliott Hughes 	}
1184*9a7741deSElliott Hughes 	if (special_case == REPEAT_PLUS_APPENDED) {
1185*9a7741deSElliott Hughes 		buf[j++] = '+';
1186*9a7741deSElliott Hughes 	} else if (special_case == REPEAT_WITH_Q) {
1187*9a7741deSElliott Hughes 		if (init_q)
1188*9a7741deSElliott Hughes 			buf[j++] = '?';
1189*9a7741deSElliott Hughes 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1190*9a7741deSElliott Hughes 			memcpy(&buf[j], atom, atomlen);
1191*9a7741deSElliott Hughes 			j += atomlen;
1192*9a7741deSElliott Hughes 			buf[j++] = '?';
1193*9a7741deSElliott Hughes 		}
1194*9a7741deSElliott Hughes 	}
1195*9a7741deSElliott Hughes 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1196*9a7741deSElliott Hughes 	j += suffix_length;
1197*9a7741deSElliott Hughes 	buf[j] = '\0';
1198*9a7741deSElliott Hughes 	/* free old basestr */
1199*9a7741deSElliott Hughes 	if (firstbasestr != basestr) {
1200*9a7741deSElliott Hughes 		if (basestr)
1201*9a7741deSElliott Hughes 			xfree(basestr);
1202*9a7741deSElliott Hughes 	}
1203*9a7741deSElliott Hughes 	basestr = buf;
1204*9a7741deSElliott Hughes 	prestr  = buf + prefix_length;
1205*9a7741deSElliott Hughes 	if (special_case == REPEAT_ZERO) {
1206*9a7741deSElliott Hughes 		prestr  -= atomlen;
1207*9a7741deSElliott Hughes 		ret++;
1208*9a7741deSElliott Hughes 	}
1209*9a7741deSElliott Hughes 	return ret;
1210*9a7741deSElliott Hughes }
1211*9a7741deSElliott Hughes 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1212*9a7741deSElliott Hughes static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1213*9a7741deSElliott Hughes 		  int atomlen, int firstnum, int secondnum)
1214*9a7741deSElliott Hughes {
1215*9a7741deSElliott Hughes 	/*
1216*9a7741deSElliott Hughes 	   In general, the repetition specifier or "bound" is replaced here
1217*9a7741deSElliott Hughes 	   by an equivalent ERE string, repeating the immediately previous atom
1218*9a7741deSElliott Hughes 	   and appending ? and + as needed. Note that the first copy of the
1219*9a7741deSElliott Hughes 	   atom is left in place, except in the special_case of a zero-repeat
1220*9a7741deSElliott Hughes 	   (i.e., {0}).
1221*9a7741deSElliott Hughes 	 */
1222*9a7741deSElliott Hughes 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1223*9a7741deSElliott Hughes 		if (firstnum < 2) {
1224*9a7741deSElliott Hughes 			/* 0 or 1: should be handled before you get here */
1225*9a7741deSElliott Hughes 			FATAL("internal error");
1226*9a7741deSElliott Hughes 		} else {
1227*9a7741deSElliott Hughes 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1228*9a7741deSElliott Hughes 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1229*9a7741deSElliott Hughes 		}
1230*9a7741deSElliott Hughes 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1231*9a7741deSElliott Hughes 		if (firstnum == 0) {	/* {0} or {0,0} */
1232*9a7741deSElliott Hughes 			/* This case is unusual because the resulting
1233*9a7741deSElliott Hughes 			   replacement string might actually be SMALLER than
1234*9a7741deSElliott Hughes 			   the original ERE */
1235*9a7741deSElliott Hughes 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1236*9a7741deSElliott Hughes 					firstnum, secondnum, REPEAT_ZERO);
1237*9a7741deSElliott Hughes 		} else {		/* (firstnum >= 1) */
1238*9a7741deSElliott Hughes 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1239*9a7741deSElliott Hughes 					firstnum, secondnum, REPEAT_SIMPLE);
1240*9a7741deSElliott Hughes 		}
1241*9a7741deSElliott Hughes 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1242*9a7741deSElliott Hughes 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1243*9a7741deSElliott Hughes 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1244*9a7741deSElliott Hughes 					firstnum, secondnum, REPEAT_WITH_Q);
1245*9a7741deSElliott Hughes 	} else {	/* Error - shouldn't be here (n>m) */
1246*9a7741deSElliott Hughes 		FATAL("internal error");
1247*9a7741deSElliott Hughes 	}
1248*9a7741deSElliott Hughes 	return 0;
1249*9a7741deSElliott Hughes }
1250*9a7741deSElliott Hughes 
relex(void)1251*9a7741deSElliott Hughes int relex(void)		/* lexical analyzer for reparse */
1252*9a7741deSElliott Hughes {
1253*9a7741deSElliott Hughes 	int c, n;
1254*9a7741deSElliott Hughes 	int cflag;
1255*9a7741deSElliott Hughes 	static uschar *buf = NULL;
1256*9a7741deSElliott Hughes 	static int bufsz = 100;
1257*9a7741deSElliott Hughes 	uschar *bp;
1258*9a7741deSElliott Hughes 	const struct charclass *cc;
1259*9a7741deSElliott Hughes 	int i;
1260*9a7741deSElliott Hughes 	int num, m;
1261*9a7741deSElliott Hughes 	bool commafound, digitfound;
1262*9a7741deSElliott Hughes 	const uschar *startreptok;
1263*9a7741deSElliott Hughes 	static int parens = 0;
1264*9a7741deSElliott Hughes 
1265*9a7741deSElliott Hughes rescan:
1266*9a7741deSElliott Hughes 	starttok = prestr;
1267*9a7741deSElliott Hughes 
1268*9a7741deSElliott Hughes 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1269*9a7741deSElliott Hughes 		prestr += n;
1270*9a7741deSElliott Hughes 		starttok = prestr;
1271*9a7741deSElliott Hughes 		return CHAR;
1272*9a7741deSElliott Hughes 	}
1273*9a7741deSElliott Hughes 
1274*9a7741deSElliott Hughes 	switch (c = *prestr++) {
1275*9a7741deSElliott Hughes 	case '|': return OR;
1276*9a7741deSElliott Hughes 	case '*': return STAR;
1277*9a7741deSElliott Hughes 	case '+': return PLUS;
1278*9a7741deSElliott Hughes 	case '?': return QUEST;
1279*9a7741deSElliott Hughes 	case '.': return DOT;
1280*9a7741deSElliott Hughes 	case '\0': prestr--; return '\0';
1281*9a7741deSElliott Hughes 	case '^':
1282*9a7741deSElliott Hughes 	case '$':
1283*9a7741deSElliott Hughes 		return c;
1284*9a7741deSElliott Hughes 	case '(':
1285*9a7741deSElliott Hughes 		parens++;
1286*9a7741deSElliott Hughes 		return c;
1287*9a7741deSElliott Hughes 	case ')':
1288*9a7741deSElliott Hughes 		if (parens) {
1289*9a7741deSElliott Hughes 			parens--;
1290*9a7741deSElliott Hughes 			return c;
1291*9a7741deSElliott Hughes 		}
1292*9a7741deSElliott Hughes 		/* unmatched close parenthesis; per POSIX, treat as literal */
1293*9a7741deSElliott Hughes 		rlxval = c;
1294*9a7741deSElliott Hughes 		return CHAR;
1295*9a7741deSElliott Hughes 	case '\\':
1296*9a7741deSElliott Hughes 		rlxval = quoted(&prestr);
1297*9a7741deSElliott Hughes 		return CHAR;
1298*9a7741deSElliott Hughes 	default:
1299*9a7741deSElliott Hughes 		rlxval = c;
1300*9a7741deSElliott Hughes 		return CHAR;
1301*9a7741deSElliott Hughes 	case '[':
1302*9a7741deSElliott Hughes 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1303*9a7741deSElliott Hughes 			FATAL("out of space in reg expr %.10s..", lastre);
1304*9a7741deSElliott Hughes 		bp = buf;
1305*9a7741deSElliott Hughes 		if (*prestr == '^') {
1306*9a7741deSElliott Hughes 			cflag = 1;
1307*9a7741deSElliott Hughes 			prestr++;
1308*9a7741deSElliott Hughes 		}
1309*9a7741deSElliott Hughes 		else
1310*9a7741deSElliott Hughes 			cflag = 0;
1311*9a7741deSElliott Hughes 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1312*9a7741deSElliott Hughes 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1313*9a7741deSElliott Hughes 			FATAL("out of space for reg expr %.10s...", lastre);
1314*9a7741deSElliott Hughes 		for (; ; ) {
1315*9a7741deSElliott Hughes 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1316*9a7741deSElliott Hughes 				for (i = 0; i < n; i++)
1317*9a7741deSElliott Hughes 					*bp++ = *prestr++;
1318*9a7741deSElliott Hughes 				continue;
1319*9a7741deSElliott Hughes 			}
1320*9a7741deSElliott Hughes 			if ((c = *prestr++) == '\\') {
1321*9a7741deSElliott Hughes 				*bp++ = '\\';
1322*9a7741deSElliott Hughes 				if ((c = *prestr++) == '\0')
1323*9a7741deSElliott Hughes 					FATAL("nonterminated character class %.20s...", lastre);
1324*9a7741deSElliott Hughes 				*bp++ = c;
1325*9a7741deSElliott Hughes 			/* } else if (c == '\n') { */
1326*9a7741deSElliott Hughes 			/* 	FATAL("newline in character class %.20s...", lastre); */
1327*9a7741deSElliott Hughes 			} else if (c == '[' && *prestr == ':') {
1328*9a7741deSElliott Hughes 				/* POSIX char class names, Dag-Erling Smorgrav, [email protected] */
1329*9a7741deSElliott Hughes 				for (cc = charclasses; cc->cc_name; cc++)
1330*9a7741deSElliott Hughes 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1331*9a7741deSElliott Hughes 						break;
1332*9a7741deSElliott Hughes 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1333*9a7741deSElliott Hughes 				    prestr[2 + cc->cc_namelen] == ']') {
1334*9a7741deSElliott Hughes 					prestr += cc->cc_namelen + 3;
1335*9a7741deSElliott Hughes 					/*
1336*9a7741deSElliott Hughes 					 * BUG: We begin at 1, instead of 0, since we
1337*9a7741deSElliott Hughes 					 * would otherwise prematurely terminate the
1338*9a7741deSElliott Hughes 					 * string for classes like [[:cntrl:]]. This
1339*9a7741deSElliott Hughes 					 * means that we can't match the NUL character,
1340*9a7741deSElliott Hughes 					 * not without first adapting the entire
1341*9a7741deSElliott Hughes 					 * program to track each string's length.
1342*9a7741deSElliott Hughes 					 */
1343*9a7741deSElliott Hughes 					for (i = 1; i <= UCHAR_MAX; i++) {
1344*9a7741deSElliott Hughes 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1345*9a7741deSElliott Hughes 						    FATAL("out of space for reg expr %.10s...", lastre);
1346*9a7741deSElliott Hughes 						if (cc->cc_func(i)) {
1347*9a7741deSElliott Hughes 							/* escape backslash */
1348*9a7741deSElliott Hughes 							if (i == '\\') {
1349*9a7741deSElliott Hughes 								*bp++ = '\\';
1350*9a7741deSElliott Hughes 								n++;
1351*9a7741deSElliott Hughes 							}
1352*9a7741deSElliott Hughes 
1353*9a7741deSElliott Hughes 							*bp++ = i;
1354*9a7741deSElliott Hughes 							n++;
1355*9a7741deSElliott Hughes 						}
1356*9a7741deSElliott Hughes 					}
1357*9a7741deSElliott Hughes 				} else
1358*9a7741deSElliott Hughes 					*bp++ = c;
1359*9a7741deSElliott Hughes 			} else if (c == '[' && *prestr == '.') {
1360*9a7741deSElliott Hughes 				char collate_char;
1361*9a7741deSElliott Hughes 				prestr++;
1362*9a7741deSElliott Hughes 				collate_char = *prestr++;
1363*9a7741deSElliott Hughes 				if (*prestr == '.' && prestr[1] == ']') {
1364*9a7741deSElliott Hughes 					prestr += 2;
1365*9a7741deSElliott Hughes 					/* Found it: map via locale TBD: for
1366*9a7741deSElliott Hughes 					   now, simply return this char.  This
1367*9a7741deSElliott Hughes 					   is sufficient to pass conformance
1368*9a7741deSElliott Hughes 					   test awk.ex 156
1369*9a7741deSElliott Hughes 					 */
1370*9a7741deSElliott Hughes 					if (*prestr == ']') {
1371*9a7741deSElliott Hughes 						prestr++;
1372*9a7741deSElliott Hughes 						rlxval = collate_char;
1373*9a7741deSElliott Hughes 						return CHAR;
1374*9a7741deSElliott Hughes 					}
1375*9a7741deSElliott Hughes 				}
1376*9a7741deSElliott Hughes 			} else if (c == '[' && *prestr == '=') {
1377*9a7741deSElliott Hughes 				char equiv_char;
1378*9a7741deSElliott Hughes 				prestr++;
1379*9a7741deSElliott Hughes 				equiv_char = *prestr++;
1380*9a7741deSElliott Hughes 				if (*prestr == '=' && prestr[1] == ']') {
1381*9a7741deSElliott Hughes 					prestr += 2;
1382*9a7741deSElliott Hughes 					/* Found it: map via locale TBD: for now
1383*9a7741deSElliott Hughes 					   simply return this char. This is
1384*9a7741deSElliott Hughes 					   sufficient to pass conformance test
1385*9a7741deSElliott Hughes 					   awk.ex 156
1386*9a7741deSElliott Hughes 					 */
1387*9a7741deSElliott Hughes 					if (*prestr == ']') {
1388*9a7741deSElliott Hughes 						prestr++;
1389*9a7741deSElliott Hughes 						rlxval = equiv_char;
1390*9a7741deSElliott Hughes 						return CHAR;
1391*9a7741deSElliott Hughes 					}
1392*9a7741deSElliott Hughes 				}
1393*9a7741deSElliott Hughes 			} else if (c == '\0') {
1394*9a7741deSElliott Hughes 				FATAL("nonterminated character class %.20s", lastre);
1395*9a7741deSElliott Hughes 			} else if (bp == buf) {	/* 1st char is special */
1396*9a7741deSElliott Hughes 				*bp++ = c;
1397*9a7741deSElliott Hughes 			} else if (c == ']') {
1398*9a7741deSElliott Hughes 				*bp++ = 0;
1399*9a7741deSElliott Hughes 				rlxstr = (uschar *) tostring((char *) buf);
1400*9a7741deSElliott Hughes 				if (cflag == 0)
1401*9a7741deSElliott Hughes 					return CCL;
1402*9a7741deSElliott Hughes 				else
1403*9a7741deSElliott Hughes 					return NCCL;
1404*9a7741deSElliott Hughes 			} else
1405*9a7741deSElliott Hughes 				*bp++ = c;
1406*9a7741deSElliott Hughes 		}
1407*9a7741deSElliott Hughes 		break;
1408*9a7741deSElliott Hughes 	case '{':
1409*9a7741deSElliott Hughes 		if (isdigit((int) *(prestr))) {
1410*9a7741deSElliott Hughes 			num = 0;	/* Process as a repetition */
1411*9a7741deSElliott Hughes 			n = -1; m = -1;
1412*9a7741deSElliott Hughes 			commafound = false;
1413*9a7741deSElliott Hughes 			digitfound = false;
1414*9a7741deSElliott Hughes 			startreptok = prestr-1;
1415*9a7741deSElliott Hughes 			/* Remember start of previous atom here ? */
1416*9a7741deSElliott Hughes 		} else {        	/* just a { char, not a repetition */
1417*9a7741deSElliott Hughes 			rlxval = c;
1418*9a7741deSElliott Hughes 			return CHAR;
1419*9a7741deSElliott Hughes                 }
1420*9a7741deSElliott Hughes 		for (; ; ) {
1421*9a7741deSElliott Hughes 			if ((c = *prestr++) == '}') {
1422*9a7741deSElliott Hughes 				if (commafound) {
1423*9a7741deSElliott Hughes 					if (digitfound) { /* {n,m} */
1424*9a7741deSElliott Hughes 						m = num;
1425*9a7741deSElliott Hughes 						if (m < n)
1426*9a7741deSElliott Hughes 							FATAL("illegal repetition expression: class %.20s",
1427*9a7741deSElliott Hughes 								lastre);
1428*9a7741deSElliott Hughes 						if (n == 0 && m == 1) {
1429*9a7741deSElliott Hughes 							return QUEST;
1430*9a7741deSElliott Hughes 						}
1431*9a7741deSElliott Hughes 					} else {	/* {n,} */
1432*9a7741deSElliott Hughes 						if (n == 0)
1433*9a7741deSElliott Hughes 							return STAR;
1434*9a7741deSElliott Hughes 						else if (n == 1)
1435*9a7741deSElliott Hughes 							return PLUS;
1436*9a7741deSElliott Hughes 					}
1437*9a7741deSElliott Hughes 				} else {
1438*9a7741deSElliott Hughes 					if (digitfound) { /* {n} same as {n,n} */
1439*9a7741deSElliott Hughes 						n = num;
1440*9a7741deSElliott Hughes 						m = num;
1441*9a7741deSElliott Hughes 					} else {	/* {} */
1442*9a7741deSElliott Hughes 						FATAL("illegal repetition expression: class %.20s",
1443*9a7741deSElliott Hughes 							lastre);
1444*9a7741deSElliott Hughes 					}
1445*9a7741deSElliott Hughes 				}
1446*9a7741deSElliott Hughes 				if (repeat(starttok, prestr-starttok, lastatom,
1447*9a7741deSElliott Hughes 					   startreptok - lastatom, n, m) > 0) {
1448*9a7741deSElliott Hughes 					if (n == 0 && m == 0) {
1449*9a7741deSElliott Hughes 						return ZERO;
1450*9a7741deSElliott Hughes 					}
1451*9a7741deSElliott Hughes 					/* must rescan input for next token */
1452*9a7741deSElliott Hughes 					goto rescan;
1453*9a7741deSElliott Hughes 				}
1454*9a7741deSElliott Hughes 				/* Failed to replace: eat up {...} characters
1455*9a7741deSElliott Hughes 				   and treat like just PLUS */
1456*9a7741deSElliott Hughes 				return PLUS;
1457*9a7741deSElliott Hughes 			} else if (c == '\0') {
1458*9a7741deSElliott Hughes 				FATAL("nonterminated character class %.20s",
1459*9a7741deSElliott Hughes 					lastre);
1460*9a7741deSElliott Hughes 			} else if (isdigit(c)) {
1461*9a7741deSElliott Hughes 				num = 10 * num + c - '0';
1462*9a7741deSElliott Hughes 				digitfound = true;
1463*9a7741deSElliott Hughes 			} else if (c == ',') {
1464*9a7741deSElliott Hughes 				if (commafound)
1465*9a7741deSElliott Hughes 					FATAL("illegal repetition expression: class %.20s",
1466*9a7741deSElliott Hughes 						lastre);
1467*9a7741deSElliott Hughes 				/* looking for {n,} or {n,m} */
1468*9a7741deSElliott Hughes 				commafound = true;
1469*9a7741deSElliott Hughes 				n = num;
1470*9a7741deSElliott Hughes 				digitfound = false; /* reset */
1471*9a7741deSElliott Hughes 				num = 0;
1472*9a7741deSElliott Hughes 			} else {
1473*9a7741deSElliott Hughes 				FATAL("illegal repetition expression: class %.20s",
1474*9a7741deSElliott Hughes 					lastre);
1475*9a7741deSElliott Hughes 			}
1476*9a7741deSElliott Hughes 		}
1477*9a7741deSElliott Hughes 		break;
1478*9a7741deSElliott Hughes 	}
1479*9a7741deSElliott Hughes }
1480*9a7741deSElliott Hughes 
cgoto(fa * f,int s,int c)1481*9a7741deSElliott Hughes int cgoto(fa *f, int s, int c)
1482*9a7741deSElliott Hughes {
1483*9a7741deSElliott Hughes 	int *p, *q;
1484*9a7741deSElliott Hughes 	int i, j, k;
1485*9a7741deSElliott Hughes 
1486*9a7741deSElliott Hughes 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1487*9a7741deSElliott Hughes 	while (f->accept >= maxsetvec) {	/* guessing here! */
1488*9a7741deSElliott Hughes 		resizesetvec(__func__);
1489*9a7741deSElliott Hughes 	}
1490*9a7741deSElliott Hughes 	for (i = 0; i <= f->accept; i++)
1491*9a7741deSElliott Hughes 		setvec[i] = 0;
1492*9a7741deSElliott Hughes 	setcnt = 0;
1493*9a7741deSElliott Hughes 	resize_state(f, s);
1494*9a7741deSElliott Hughes 	/* compute positions of gototab[s,c] into setvec */
1495*9a7741deSElliott Hughes 	p = f->posns[s];
1496*9a7741deSElliott Hughes 	for (i = 1; i <= *p; i++) {
1497*9a7741deSElliott Hughes 		if ((k = f->re[p[i]].ltype) != FINAL) {
1498*9a7741deSElliott Hughes 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1499*9a7741deSElliott Hughes 			 || (k == DOT && c != 0 && c != HAT)
1500*9a7741deSElliott Hughes 			 || (k == ALL && c != 0)
1501*9a7741deSElliott Hughes 			 || (k == EMPTYRE && c != 0)
1502*9a7741deSElliott Hughes 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1503*9a7741deSElliott Hughes 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1504*9a7741deSElliott Hughes 				q = f->re[p[i]].lfollow;
1505*9a7741deSElliott Hughes 				for (j = 1; j <= *q; j++) {
1506*9a7741deSElliott Hughes 					if (q[j] >= maxsetvec) {
1507*9a7741deSElliott Hughes 						resizesetvec(__func__);
1508*9a7741deSElliott Hughes 					}
1509*9a7741deSElliott Hughes 					if (setvec[q[j]] == 0) {
1510*9a7741deSElliott Hughes 						setcnt++;
1511*9a7741deSElliott Hughes 						setvec[q[j]] = 1;
1512*9a7741deSElliott Hughes 					}
1513*9a7741deSElliott Hughes 				}
1514*9a7741deSElliott Hughes 			}
1515*9a7741deSElliott Hughes 		}
1516*9a7741deSElliott Hughes 	}
1517*9a7741deSElliott Hughes 	/* determine if setvec is a previous state */
1518*9a7741deSElliott Hughes 	tmpset[0] = setcnt;
1519*9a7741deSElliott Hughes 	j = 1;
1520*9a7741deSElliott Hughes 	for (i = f->accept; i >= 0; i--)
1521*9a7741deSElliott Hughes 		if (setvec[i]) {
1522*9a7741deSElliott Hughes 			tmpset[j++] = i;
1523*9a7741deSElliott Hughes 		}
1524*9a7741deSElliott Hughes 	resize_state(f, f->curstat > s ? f->curstat : s);
1525*9a7741deSElliott Hughes 	/* tmpset == previous state? */
1526*9a7741deSElliott Hughes 	for (i = 1; i <= f->curstat; i++) {
1527*9a7741deSElliott Hughes 		p = f->posns[i];
1528*9a7741deSElliott Hughes 		if ((k = tmpset[0]) != p[0])
1529*9a7741deSElliott Hughes 			goto different;
1530*9a7741deSElliott Hughes 		for (j = 1; j <= k; j++)
1531*9a7741deSElliott Hughes 			if (tmpset[j] != p[j])
1532*9a7741deSElliott Hughes 				goto different;
1533*9a7741deSElliott Hughes 		/* setvec is state i */
1534*9a7741deSElliott Hughes 		if (c != HAT)
1535*9a7741deSElliott Hughes 			set_gototab(f, s, c, i);
1536*9a7741deSElliott Hughes 		return i;
1537*9a7741deSElliott Hughes 	  different:;
1538*9a7741deSElliott Hughes 	}
1539*9a7741deSElliott Hughes 
1540*9a7741deSElliott Hughes 	/* add tmpset to current set of states */
1541*9a7741deSElliott Hughes 	++(f->curstat);
1542*9a7741deSElliott Hughes 	resize_state(f, f->curstat);
1543*9a7741deSElliott Hughes 	clear_gototab(f, f->curstat);
1544*9a7741deSElliott Hughes 	xfree(f->posns[f->curstat]);
1545*9a7741deSElliott Hughes 	p = intalloc(setcnt + 1, __func__);
1546*9a7741deSElliott Hughes 
1547*9a7741deSElliott Hughes 	f->posns[f->curstat] = p;
1548*9a7741deSElliott Hughes 	if (c != HAT)
1549*9a7741deSElliott Hughes 		set_gototab(f, s, c, f->curstat);
1550*9a7741deSElliott Hughes 	for (i = 0; i <= setcnt; i++)
1551*9a7741deSElliott Hughes 		p[i] = tmpset[i];
1552*9a7741deSElliott Hughes 	if (setvec[f->accept])
1553*9a7741deSElliott Hughes 		f->out[f->curstat] = 1;
1554*9a7741deSElliott Hughes 	else
1555*9a7741deSElliott Hughes 		f->out[f->curstat] = 0;
1556*9a7741deSElliott Hughes 	return f->curstat;
1557*9a7741deSElliott Hughes }
1558*9a7741deSElliott Hughes 
1559*9a7741deSElliott Hughes 
freefa(fa * f)1560*9a7741deSElliott Hughes void freefa(fa *f)	/* free a finite automaton */
1561*9a7741deSElliott Hughes {
1562*9a7741deSElliott Hughes 	int i;
1563*9a7741deSElliott Hughes 
1564*9a7741deSElliott Hughes 	if (f == NULL)
1565*9a7741deSElliott Hughes 		return;
1566*9a7741deSElliott Hughes 	for (i = 0; i < f->state_count; i++)
1567*9a7741deSElliott Hughes 		xfree(f->gototab[i].entries);
1568*9a7741deSElliott Hughes 	xfree(f->gototab);
1569*9a7741deSElliott Hughes 	for (i = 0; i <= f->curstat; i++)
1570*9a7741deSElliott Hughes 		xfree(f->posns[i]);
1571*9a7741deSElliott Hughes 	for (i = 0; i <= f->accept; i++) {
1572*9a7741deSElliott Hughes 		xfree(f->re[i].lfollow);
1573*9a7741deSElliott Hughes 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1574*9a7741deSElliott Hughes 			xfree(f->re[i].lval.np);
1575*9a7741deSElliott Hughes 	}
1576*9a7741deSElliott Hughes 	xfree(f->restr);
1577*9a7741deSElliott Hughes 	xfree(f->out);
1578*9a7741deSElliott Hughes 	xfree(f->posns);
1579*9a7741deSElliott Hughes 	xfree(f->gototab);
1580*9a7741deSElliott Hughes 	xfree(f);
1581*9a7741deSElliott Hughes }
1582