xref: /aosp_15_r20/bionic/libc/upstream-netbsd/lib/libc/regex/regcomp.c (revision 8d67ca893c1523eb926b9080dbe4e2ffd2a27ba1)
1 /*	$NetBSD: regcomp.c,v 1.47 2022/12/21 17:44:15 wiz Exp $	*/
2 
3 /*-
4  * SPDX-License-Identifier: BSD-3-Clause
5  *
6  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7  * Copyright (c) 1992, 1993, 1994
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * Copyright (c) 2011 The FreeBSD Foundation
11  * All rights reserved.
12  * Portions of this software were developed by David Chisnall
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * This code is derived from software contributed to Berkeley by
16  * Henry Spencer.
17  *
18  * Redistribution and use in source and binary forms, with or without
19  * modification, are permitted provided that the following conditions
20  * are met:
21  * 1. Redistributions of source code must retain the above copyright
22  *    notice, this list of conditions and the following disclaimer.
23  * 2. Redistributions in binary form must reproduce the above copyright
24  *    notice, this list of conditions and the following disclaimer in the
25  *    documentation and/or other materials provided with the distribution.
26  * 3. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
43  */
44 
45 #if HAVE_NBTOOL_CONFIG_H
46 #include "nbtool_config.h"
47 #endif
48 
49 #include <sys/cdefs.h>
50 #if 0
51 static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
52 __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
53 #endif
54 __RCSID("$NetBSD: regcomp.c,v 1.47 2022/12/21 17:44:15 wiz Exp $");
55 
56 #ifndef LIBHACK
57 #define REGEX_GNU_EXTENSIONS
58 
59 #include "namespace.h"
60 #endif
61 #include <sys/types.h>
62 #include <stdio.h>
63 #include <string.h>
64 #include <ctype.h>
65 #include <limits.h>
66 #include <stdlib.h>
67 #include <regex.h>
68 #include <stdbool.h>
69 
70 #if defined(__weak_alias) && !defined(LIBHACK)
71 __weak_alias(regcomp,_regcomp)
72 #endif
73 
74 #ifdef REGEX_LIBC_COLLATE
75 #include "collate.h"
76 #endif
77 
78 #include "utils.h"
79 #include "regex2.h"
80 
81 #include "cname.h"
82 
83 /*
84  * Branching context, used to keep track of branch state for all of the branch-
85  * aware functions. In addition to keeping track of branch positions for the
86  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
87  * detection of whether ^ is acting as an anchor or being used erroneously and
88  * also for whether we're in a sub-expression or not.
89  */
90 struct branchc {
91 	sopno start;
92 	sopno back;
93 	sopno fwd;
94 
95 	int nbranch;
96 	int nchain;
97 	bool outer;
98 	bool terminate;
99 };
100 
101 /*
102  * parse structure, passed up and down to avoid global variables and
103  * other clumsinesses
104  */
105 struct parse {
106 	const char *next;	/* next character in RE */
107 	const char *end;	/* end of string (-> NUL normally) */
108 	int error;		/* has an error been seen? */
109 	int gnuext;
110 	sop *strip;		/* malloced strip */
111 	sopno ssize;		/* malloced strip size (allocated) */
112 	sopno slen;		/* malloced strip length (used) */
113 	size_t ncsalloc;	/* number of csets allocated */
114 	struct re_guts *g;
115 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
116 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
117 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
118 	bool allowbranch;	/* can this expression branch? */
119 	bool bre;		/* convenience; is this a BRE? */
120 	int pflags;		/* other parsing flags -- legacy escapes? */
121 	bool (*parse_expr)(struct parse *, struct branchc *);
122 	void (*pre_parse)(struct parse *, struct branchc *);
123 	void (*post_parse)(struct parse *, struct branchc *);
124 };
125 
126 #define PFLAG_LEGACY_ESC	0x00000001
127 
128 /* ========= begin header generated by ./mkh ========= */
129 #ifdef __cplusplus
130 extern "C" {
131 #endif
132 
133 /* === regcomp.c === */
134 static bool p_ere_exp(struct parse *p, struct branchc *bc);
135 static void p_str(struct parse *p);
136 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
137 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
138 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
139 static bool p_branch_empty(struct parse *p, struct branchc *bc);
140 static bool p_branch_do(struct parse *p, struct branchc *bc);
141 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
142 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
143 static void p_re(struct parse *p, int end1, int end2);
144 static bool p_simp_re(struct parse *p, struct branchc *bc);
145 static int p_count(struct parse *p);
146 static void p_bracket(struct parse *p);
147 static int p_range_cmp(wchar_t c1, wchar_t c2);
148 static void p_b_term(struct parse *p, cset *cs);
149 #ifdef REGEX_GNU_EXTENSIONS
150 static int p_b_pseudoclass(struct parse *p, char c);
151 #endif
152 static void p_b_cclass(struct parse *p, cset *cs);
153 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
154 static void p_b_eclass(struct parse *p, cset *cs);
155 static wint_t p_b_symbol(struct parse *p);
156 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
157 static bool may_escape(struct parse *p, const wint_t ch);
158 static wint_t othercase(wint_t ch);
159 static void bothcases(struct parse *p, wint_t ch);
160 static void ordinary(struct parse *p, wint_t ch);
161 static void nonnewline(struct parse *p);
162 static void repeat(struct parse *p, sopno start, int from, int to);
163 static int seterr(struct parse *p, int e);
164 static cset *allocset(struct parse *p);
165 static void freeset(struct parse *p, cset *cs);
166 static void CHadd(struct parse *p, cset *cs, wint_t ch);
167 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
168 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
169 static wint_t singleton(cset *cs);
170 static sopno dupl(struct parse *p, sopno start, sopno finish);
171 static void doemit(struct parse *p, sop op, size_t opnd);
172 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
173 static void dofwd(struct parse *p, sopno pos, sop value);
174 static int enlarge(struct parse *p, sopno size);
175 static void stripsnug(struct parse *p, struct re_guts *g);
176 static void findmust(struct parse *p, struct re_guts *g);
177 static int altoffset(sop *scan, int offset);
178 static void computejumps(struct parse *p, struct re_guts *g);
179 static void computematchjumps(struct parse *p, struct re_guts *g);
180 static sopno pluscount(struct parse *p, struct re_guts *g);
181 static wint_t wgetnext(struct parse *p);
182 
183 #ifdef __cplusplus
184 }
185 #endif
186 /* ========= end header generated by ./mkh ========= */
187 
188 static char nuls[10];		/* place to point scanner in event of error */
189 
190 /*
191  * macros for use with parse structure
192  * BEWARE:  these know that the parse structure is named `p' !!!
193  */
194 #define	PEEK()	(*p->next)
195 #define	PEEK2()	(*(p->next+1))
196 #define	MORE()	(p->next < p->end)
197 #define	MORE2()	(p->next+1 < p->end)
198 #define	SEE(c)	(MORE() && PEEK() == (c))
199 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
200 #define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
201 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
202 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
203 #define	EATSPEC(a)	(p->bre ? EATTWO('\\', a) : EAT(a))
204 #define	NEXT()	(p->next++)
205 #define	NEXT2()	(p->next += 2)
206 #define	NEXTn(n)	(p->next += (n))
207 #define	GETNEXT()	(*p->next++)
208 #define	WGETNEXT()	wgetnext(p)
209 #define	SETERROR(e)	seterr(p, (e))
210 #define	REQUIRE(co, e)	((co) || SETERROR(e))
211 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
212 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
213 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
214 #define	EMIT(op, sopnd)	doemit(p, (op), (sopnd))
215 #define	INSERT(op, pos)	doinsert(p, (op), HERE()-(pos)+1, pos)
216 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
217 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
218 #define	HERE()		(p->slen)
219 #define	THERE()		(p->slen - 1)
220 #define	THERETHERE()	(p->slen - 2)
221 #define	DROP(n)	(p->slen -= (n))
222 
223 /* Macro used by computejump()/computematchjump() */
224 #ifndef MIN
225 #define MIN(a,b)	((a)<(b)?(a):(b))
226 #endif
227 
228 #ifndef NLS
229 static const struct {
230 	const char *name;
231 	int (*func)(int);
232 } wctypes[] = {
233 #define ADD(x) { .name = # x, .func = is ## x }
234 	ADD(alnum),
235 	ADD(alpha),
236 	ADD(blank),
237 	ADD(cntrl),
238 	ADD(digit),
239 	ADD(graph),
240 	ADD(lower),
241 	ADD(print),
242 	ADD(punct),
243 	ADD(space),
244 	ADD(upper),
245 	ADD(xdigit),
246 #undef ADD
247 };
248 
249 wctype_t
__regex_wctype(const char * str)250 __regex_wctype(const char *str)
251 {
252 	for (size_t i = 0; i < __arraycount(wctypes); i++) {
253 		if (strcmp(wctypes[i].name, str) == 0)
254 			return (wctype_t)(i + 1);
255 	}
256 	return (wctype_t)0;
257 }
258 
259 int
__regex_iswctype(wint_t c,wctype_t ct)260 __regex_iswctype(wint_t c, wctype_t ct)
261 {
262 	if (ct == 0)
263 		return 0;
264 	return (*wctypes[ct - 1].func)(c);
265 }
266 #endif
267 
268 static int				/* 0 success, otherwise REG_something */
regcomp_internal(regex_t * __restrict preg,const char * __restrict pattern,int cflags,int pflags)269 regcomp_internal(regex_t * __restrict preg,
270 	const char * __restrict pattern,
271 	int cflags, int pflags)
272 {
273 	struct parse pa;
274 	struct re_guts *g;
275 	struct parse *p = &pa;
276 	int i;
277 	size_t len;
278 	size_t maxlen;
279 #ifdef REDEBUG
280 #	define	GOODFLAGS(f)	(f)
281 #else
282 #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
283 #endif
284 
285 	_DIAGASSERT(preg != NULL);
286 	_DIAGASSERT(pattern != NULL);
287 
288 	cflags = GOODFLAGS(cflags);
289 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
290 		return(REG_INVARG);
291 
292 	if (cflags&REG_PEND) {
293 		if (preg->re_endp < pattern)
294 			return(REG_INVARG);
295 		len = preg->re_endp - pattern;
296 	} else
297 		len = strlen(pattern);
298 
299 	/* do the mallocs early so failure handling is easy */
300 	g = malloc(sizeof(*g));
301 	if (g == NULL)
302 		return(REG_ESPACE);
303 	/*
304 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
305 	 * extension.  Also avoid any signed overflow in case of conversion
306 	 * so make the real limit based on a 31-bit overflow.
307 	 *
308 	 * Likely not applicable on 64-bit systems but handle the case
309 	 * generically (who are we to stop people from using ~715MB+
310 	 * patterns?).
311 	 */
312 	maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
313 	if (len >= maxlen) {
314 		free(g);
315 		return(REG_ESPACE);
316 	}
317 	p->ssize = (sopno)(len / 2 * 3 + 1);	/* ugh */
318 	assert(p->ssize >= len);
319 
320 	p->strip = calloc(p->ssize, sizeof(*p->strip));
321 	p->slen = 0;
322 	if (p->strip == NULL) {
323 		free(g);
324 		return(REG_ESPACE);
325 	}
326 
327 	/* set things up */
328 	p->g = g;
329 	p->next = pattern;	/* convenience; we do not modify it */
330 	p->end = p->next + len;
331 	p->error = 0;
332 	p->ncsalloc = 0;
333 	p->pflags = pflags;
334 	for (i = 0; i < NPAREN; i++) {
335 		p->pbegin[i] = 0;
336 		p->pend[i] = 0;
337 	}
338 #ifdef REGEX_GNU_EXTENSIONS
339 	if ((cflags & REG_GNU) == 0) {
340 		p->gnuext = false;
341 		p->allowbranch = (cflags & REG_EXTENDED) != 0;
342 	} else
343 		p->gnuext = p->allowbranch = true;
344 #else
345 	p->gnuext = false;
346 	p->allowbranch = (cflags & REG_EXTENDED) != 0;
347 #endif
348 	if (cflags & REG_EXTENDED) {
349 		p->bre = false;
350 		p->parse_expr = p_ere_exp;
351 		p->pre_parse = NULL;
352 		p->post_parse = NULL;
353 	} else {
354 		p->bre = true;
355 		p->parse_expr = p_simp_re;
356 		p->pre_parse = p_bre_pre_parse;
357 		p->post_parse = p_bre_post_parse;
358 	}
359 	g->sets = NULL;
360 	g->ncsets = 0;
361 	g->cflags = cflags;
362 	g->iflags = 0;
363 	g->nbol = 0;
364 	g->neol = 0;
365 	g->must = NULL;
366 	g->moffset = -1;
367 	g->charjump = NULL;
368 	g->matchjump = NULL;
369 	g->mlen = 0;
370 	g->nsub = 0;
371 	g->backrefs = 0;
372 
373 	/* do it */
374 	EMIT(OEND, 0);
375 	g->firststate = THERE();
376 	if (cflags & REG_NOSPEC)
377 		p_str(p);
378 	else
379 		p_re(p, OUT, OUT);
380 	EMIT(OEND, 0);
381 	g->laststate = THERE();
382 
383 	/* tidy up loose ends and fill things in */
384 	stripsnug(p, g);
385 	findmust(p, g);
386 	/* only use Boyer-Moore algorithm if the pattern is bigger
387 	 * than three characters
388 	 */
389 	if(g->mlen > 3) {
390 		computejumps(p, g);
391 		computematchjumps(p, g);
392 		if(g->matchjump == NULL && g->charjump != NULL) {
393 			free(g->charjump);
394 			g->charjump = NULL;
395 		}
396 	}
397 	g->nplus = pluscount(p, g);
398 	g->magic = MAGIC2;
399 	preg->re_nsub = g->nsub;
400 	preg->re_g = g;
401 	preg->re_magic = MAGIC1;
402 #ifndef REDEBUG
403 	/* not debugging, so can't rely on the assert() in regexec() */
404 	if (g->iflags&BAD)
405 		SETERROR(REG_ASSERT);
406 #endif
407 
408 	/* win or lose, we're done */
409 	if (p->error != 0)	/* lose */
410 		regfree(preg);
411 	return(p->error);
412 }
413 
414 /*
415  - regcomp - interface for parser and compilation
416  = extern int regcomp(regex_t *, const char *, int);
417  = #define	REG_BASIC	0000
418  = #define	REG_EXTENDED	0001
419  = #define	REG_ICASE	0002
420  = #define	REG_NOSUB	0004
421  = #define	REG_NEWLINE	0010
422  = #define	REG_NOSPEC	0020
423  = #define	REG_PEND	0040
424  = #define	REG_DUMP	0200
425  */
426 int				/* 0 success, otherwise REG_something */
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)427 regcomp(regex_t * __restrict preg,
428 	const char * __restrict pattern,
429 	int cflags)
430 {
431 
432 	return (regcomp_internal(preg, pattern, cflags, 0));
433 }
434 
435 /*
436  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
437  - return whether we should terminate or not
438  == static bool p_ere_exp(struct parse *p);
439  */
440 static bool
p_ere_exp(struct parse * p,struct branchc * bc)441 p_ere_exp(struct parse *p, struct branchc *bc)
442 {
443 	char c;
444 	wint_t wc;
445 	sopno pos;
446 	int count;
447 	int count2;
448 #ifdef REGEX_GNU_EXTENSIONS
449 	size_t i;
450 	int handled;
451 #endif
452 	sopno subno;
453 	int wascaret = 0;
454 
455 	_DIAGASSERT(p != NULL);
456 
457 	(void)bc;
458 	assert(MORE());		/* caller should have ensured this */
459 	c = GETNEXT();
460 
461 #ifdef REGEX_GNU_EXTENSIONS
462 	handled = 0;
463 #endif
464 	pos = HERE();
465 	switch (c) {
466 	case '(':
467 		(void)REQUIRE(MORE(), REG_EPAREN);
468 		p->g->nsub++;
469 		subno = (sopno)p->g->nsub;
470 		if (subno < NPAREN)
471 			p->pbegin[subno] = HERE();
472 		EMIT(OLPAREN, subno);
473 		if (!SEE(')'))
474 			p_re(p, ')', IGN);
475 		if (subno < NPAREN) {
476 			p->pend[subno] = HERE();
477 			assert(p->pend[subno] != 0);
478 		}
479 		EMIT(ORPAREN, subno);
480 		(void)MUSTEAT(')', REG_EPAREN);
481 		break;
482 #ifndef POSIX_MISTAKE
483 	case ')':		/* happens only if no current unmatched ( */
484 		/*
485 		 * You may ask, why the ifndef?  Because I didn't notice
486 		 * this until slightly too late for 1003.2, and none of the
487 		 * other 1003.2 regular-expression reviewers noticed it at
488 		 * all.  So an unmatched ) is legal POSIX, at least until
489 		 * we can get it fixed.
490 		 */
491 		SETERROR(REG_EPAREN);
492 		break;
493 #endif
494 	case '^':
495 		EMIT(OBOL, 0);
496 		p->g->iflags |= USEBOL;
497 		p->g->nbol++;
498 		wascaret = 1;
499 		break;
500 	case '$':
501 		EMIT(OEOL, 0);
502 		p->g->iflags |= USEEOL;
503 		p->g->neol++;
504 		break;
505 	case '|':
506 		SETERROR(REG_EMPTY);
507 		break;
508 	case '*':
509 	case '+':
510 	case '?':
511 	case '{':
512 		SETERROR(REG_BADRPT);
513 		break;
514 	case '.':
515 		if (p->g->cflags&REG_NEWLINE)
516 			nonnewline(p);
517 		else
518 			EMIT(OANY, 0);
519 		break;
520 	case '[':
521 		p_bracket(p);
522 		break;
523 	case '\\':
524 		(void)REQUIRE(MORE(), REG_EESCAPE);
525 		wc = WGETNEXT();
526 #ifdef REGEX_GNU_EXTENSIONS
527 		if (p->gnuext) {
528 			handled = 1;
529 			switch (wc) {
530 			case '`':
531 				EMIT(OBOS, 0);
532 				break;
533 			case '\'':
534 				EMIT(OEOS, 0);
535 				break;
536 			case 'B':
537 				EMIT(ONWBND, 0);
538 				break;
539 			case 'b':
540 				EMIT(OWBND, 0);
541 				break;
542 			case 'W':
543 			case 'w':
544 			case 'S':
545 			case 's':
546 				p_b_pseudoclass(p, wc);
547 				break;
548 			case 'a':
549 				ordinary(p, '\a');
550 				break;
551 			case 'e':
552 				ordinary(p, '\e');
553 				break;
554 			case 'f':
555 				ordinary(p, '\f');
556 				break;
557 			case 'n':
558 				ordinary(p, '\n');
559 				break;
560 			case 'r':
561 				ordinary(p, '\r');
562 				break;
563 			case 't':
564 				ordinary(p, '\t');
565 				break;
566 			case 'v':
567 				ordinary(p, '\v');
568 				break;
569 			case '1':
570 			case '2':
571 			case '3':
572 			case '4':
573 			case '5':
574 			case '6':
575 			case '7':
576 			case '8':
577 			case '9':
578 				i = wc - '0';
579 				assert(i < NPAREN);
580 				if (p->pend[i] != 0) {
581 					assert(i <= p->g->nsub);
582 					EMIT(OBACK_, i);
583 					assert(p->pbegin[i] != 0);
584 					assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
585 					assert(OP(p->strip[p->pend[i]]) == ORPAREN);
586 					(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
587 					EMIT(O_BACK, i);
588 				} else
589 					SETERROR(REG_ESUBREG);
590 				p->g->backrefs = 1;
591 				break;
592 			default:
593 				handled = 0;
594 			}
595 			/* Don't proceed to the POSIX bits if we've already handled it */
596 			if (handled)
597 				break;
598 		}
599 #endif
600 		switch (wc) {
601 		case '<':
602 			EMIT(OBOW, 0);
603 			break;
604 		case '>':
605 			EMIT(OEOW, 0);
606 			break;
607 		default:
608 			if (may_escape(p, wc))
609 				ordinary(p, wc);
610 			else
611 				SETERROR(REG_EESCAPE);
612 			break;
613 		}
614 		break;
615 	default:
616 		if (p->error != 0)
617 			return (false);
618 		p->next--;
619 		wc = WGETNEXT();
620 		ordinary(p, wc);
621 		break;
622 	}
623 
624 	if (!MORE())
625 		return (false);
626 	c = PEEK();
627 	/* we call { a repetition if followed by a digit */
628 	if (!( c == '*' || c == '+' || c == '?' || c == '{'))
629 		return (false);		/* no repetition, we're done */
630 	else if (c == '{')
631 		(void)REQUIRE(MORE2() && \
632 		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
633 	NEXT();
634 
635 	(void)REQUIRE(!wascaret, REG_BADRPT);
636 	switch (c) {
637 	case '*':	/* implemented as +? */
638 		/* this case does not require the (y|) trick, noKLUDGE */
639 		INSERT(OPLUS_, pos);
640 		ASTERN(O_PLUS, pos);
641 		INSERT(OQUEST_, pos);
642 		ASTERN(O_QUEST, pos);
643 		break;
644 	case '+':
645 		INSERT(OPLUS_, pos);
646 		ASTERN(O_PLUS, pos);
647 		break;
648 	case '?':
649 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
650 		INSERT(OCH_, pos);		/* offset slightly wrong */
651 		ASTERN(OOR1, pos);		/* this one's right */
652 		AHEAD(pos);			/* fix the OCH_ */
653 		EMIT(OOR2, 0);			/* offset very wrong... */
654 		AHEAD(THERE());			/* ...so fix it */
655 		ASTERN(O_CH, THERETHERE());
656 		break;
657 	case '{':
658 		count = p_count(p);
659 		if (EAT(',')) {
660 			if (isdigit((uch)PEEK())) {
661 				count2 = p_count(p);
662 				(void)REQUIRE(count <= count2, REG_BADBR);
663 			} else		/* single number with comma */
664 				count2 = INFINITY;
665 		} else		/* just a single number */
666 			count2 = count;
667 		repeat(p, pos, count, count2);
668 		if (!EAT('}')) {	/* error heuristics */
669 			while (MORE() && PEEK() != '}')
670 				NEXT();
671 			(void)REQUIRE(MORE(), REG_EBRACE);
672 			SETERROR(REG_BADBR);
673 		}
674 		break;
675 	}
676 
677 	if (!MORE())
678 		return (false);
679 	c = PEEK();
680 	if (!( c == '*' || c == '+' || c == '?' ||
681 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
682 		return (false);
683 	SETERROR(REG_BADRPT);
684 	return (false);
685 }
686 
687 /*
688  - p_str - string (no metacharacters) "parser"
689  == static void p_str(struct parse *p);
690  */
691 static void
p_str(struct parse * p)692 p_str(struct parse *p)
693 {
694 	(void)REQUIRE(MORE(), REG_EMPTY);
695 	while (MORE())
696 		ordinary(p, WGETNEXT());
697 }
698 
699 /*
700  * Eat consecutive branch delimiters for the kind of expression that we are
701  * parsing, return the number of delimiters that we ate.
702  */
703 static int
p_branch_eat_delim(struct parse * p,struct branchc * bc)704 p_branch_eat_delim(struct parse *p, struct branchc *bc)
705 {
706 	int nskip;
707 
708 	(void)bc;
709 	nskip = 0;
710 	while (EATSPEC('|'))
711 		++nskip;
712 	return (nskip);
713 }
714 
715 /*
716  * Insert necessary branch book-keeping operations. This emits a
717  * bogus 'next' offset, since we still have more to parse
718  */
719 static void
p_branch_ins_offset(struct parse * p,struct branchc * bc)720 p_branch_ins_offset(struct parse *p, struct branchc *bc)
721 {
722 
723 	if (bc->nbranch == 0) {
724 		INSERT(OCH_, bc->start);	/* offset is wrong */
725 		bc->fwd = bc->start;
726 		bc->back = bc->start;
727 	}
728 
729 	ASTERN(OOR1, bc->back);
730 	bc->back = THERE();
731 	AHEAD(bc->fwd);			/* fix previous offset */
732 	bc->fwd = HERE();
733 	EMIT(OOR2, 0);			/* offset is very wrong */
734 	++bc->nbranch;
735 }
736 
737 /*
738  * Fix the offset of the tail branch, if we actually had any branches.
739  * This is to correct the bogus placeholder offset that we use.
740  */
741 static void
p_branch_fix_tail(struct parse * p,struct branchc * bc)742 p_branch_fix_tail(struct parse *p, struct branchc *bc)
743 {
744 
745 	/* Fix bogus offset at the tail if we actually have branches */
746 	if (bc->nbranch > 0) {
747 		AHEAD(bc->fwd);
748 		ASTERN(O_CH, bc->back);
749 	}
750 }
751 
752 /*
753  * Signal to the parser that an empty branch has been encountered; this will,
754  * in the future, be used to allow for more permissive behavior with empty
755  * branches. The return value should indicate whether parsing may continue
756  * or not.
757  */
758 static bool
p_branch_empty(struct parse * p,struct branchc * bc)759 p_branch_empty(struct parse *p, struct branchc *bc)
760 {
761 
762 	(void)bc;
763 	SETERROR(REG_EMPTY);
764 	return (false);
765 }
766 
767 /*
768  * Take care of any branching requirements. This includes inserting the
769  * appropriate branching instructions as well as eating all of the branch
770  * delimiters until we either run out of pattern or need to parse more pattern.
771  */
772 static bool
p_branch_do(struct parse * p,struct branchc * bc)773 p_branch_do(struct parse *p, struct branchc *bc)
774 {
775 	int ate = 0;
776 
777 	ate = p_branch_eat_delim(p, bc);
778 	if (ate == 0)
779 		return (false);
780 	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
781 		/*
782 		 * Halt parsing only if we have an empty branch and p_branch_empty
783 		 * indicates that we must not continue. In the future, this will not
784 		 * necessarily be an error.
785 		 */
786 		return (false);
787 	p_branch_ins_offset(p, bc);
788 
789 	return (true);
790 }
791 
792 static void
p_bre_pre_parse(struct parse * p,struct branchc * bc)793 p_bre_pre_parse(struct parse *p, struct branchc *bc)
794 {
795 
796 	(void)bc;
797 	/*
798 	 * Does not move cleanly into expression parser because of
799 	 * ordinary interpration of * at the beginning position of
800 	 * an expression.
801 	 */
802 	if (EAT('^')) {
803 		EMIT(OBOL, 0);
804 		p->g->iflags |= USEBOL;
805 		p->g->nbol++;
806 	}
807 }
808 
809 static void
p_bre_post_parse(struct parse * p,struct branchc * bc)810 p_bre_post_parse(struct parse *p, struct branchc *bc)
811 {
812 
813 	/* Expression is terminating due to EOL token */
814 	if (bc->terminate) {
815 		DROP(1);
816 		EMIT(OEOL, 0);
817 		p->g->iflags |= USEEOL;
818 		p->g->neol++;
819 	}
820 }
821 
822 /*
823  - p_re - Top level parser, concatenation and BRE anchoring
824  == static void p_re(struct parse *p, int end1, int end2);
825  * Giving end1 as OUT essentially eliminates the end1/end2 check.
826  *
827  * This implementation is a bit of a kludge, in that a trailing $ is first
828  * taken as an ordinary character and then revised to be an anchor.
829  * The amount of lookahead needed to avoid this kludge is excessive.
830  */
831 static void
p_re(struct parse * p,int end1,int end2)832 p_re(struct parse *p,
833 	int end1,	/* first terminating character */
834 	int end2)	/* second terminating character; ignored for EREs */
835 {
836 	struct branchc bc;
837 
838 	bc.nbranch = 0;
839 	if (end1 == OUT && end2 == OUT)
840 		bc.outer = true;
841 	else
842 		bc.outer = false;
843 #define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
844 	for (;;) {
845 		bc.start = HERE();
846 		bc.nchain = 0;
847 		bc.terminate = false;
848 		if (p->pre_parse != NULL)
849 			p->pre_parse(p, &bc);
850 		while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
851 			bc.terminate = p->parse_expr(p, &bc);
852 			++bc.nchain;
853 		}
854 		if (p->post_parse != NULL)
855 			p->post_parse(p, &bc);
856 		(void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
857 #ifdef REGEX_GNU_EXTENSIONS
858 		if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
859 			break;
860 #endif
861 		if (!p->allowbranch)
862 			break;
863 		/*
864 		 * p_branch_do's return value indicates whether we should
865 		 * continue parsing or not. This is both for correctness and
866 		 * a slight optimization, because it will check if we've
867 		 * encountered an empty branch or the end of the string
868 		 * immediately following a branch delimiter.
869 		 */
870 		if (!p_branch_do(p, &bc))
871 			break;
872 	}
873 #undef SEE_END
874 	if (p->allowbranch)
875 		p_branch_fix_tail(p, &bc);
876 	assert(!MORE() || SEE(end1));
877 }
878 
879 /*
880  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
881  == static bool p_simp_re(struct parse *p, struct branchc *bc);
882  */
883 static bool			/* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,struct branchc * bc)884 p_simp_re(struct parse *p, struct branchc *bc)
885 {
886 	int c;
887 	int cc;			/* convenient/control character */
888 	int count;
889 	int count2;
890 	sopno pos;
891 	bool handled;
892 	size_t i;
893 	wint_t wc;
894 	sopno subno;
895 #	define	BACKSL	(1<<CHAR_BIT)
896 
897 	pos = HERE();		/* repetition op, if any, covers from here */
898 	handled = false;
899 
900 	assert(MORE());		/* caller should have ensured this */
901 	c = GETNEXT();
902 	if (c == '\\') {
903 		(void)REQUIRE(MORE(), REG_EESCAPE);
904 		cc = GETNEXT();
905 		c = BACKSL | cc;
906 #ifdef REGEX_GNU_EXTENSIONS
907 		if (p->gnuext) {
908 			handled = true;
909 			switch (c) {
910 			case BACKSL|'`':
911 				EMIT(OBOS, 0);
912 				break;
913 			case BACKSL|'\'':
914 				EMIT(OEOS, 0);
915 				break;
916 			case BACKSL|'B':
917 				EMIT(ONWBND, 0);
918 				break;
919 			case BACKSL|'b':
920 				EMIT(OWBND, 0);
921 				break;
922 			case BACKSL|'W':
923 			case BACKSL|'w':
924 			case BACKSL|'S':
925 			case BACKSL|'s':
926 				p_b_pseudoclass(p, cc);
927 				break;
928 			case BACKSL|'a':
929 				ordinary(p, '\a');
930 				break;
931 			case BACKSL|'e':
932 				ordinary(p, '\e');
933 				break;
934 			case BACKSL|'f':
935 				ordinary(p, '\f');
936 				break;
937 			case BACKSL|'n':
938 				ordinary(p, '\n');
939 				break;
940 			case BACKSL|'r':
941 				ordinary(p, '\r');
942 				break;
943 			case BACKSL|'t':
944 				ordinary(p, '\t');
945 				break;
946 			case BACKSL|'v':
947 				ordinary(p, '\v');
948 				break;
949 			default:
950 				handled = false;
951 			}
952 		}
953 #endif
954 	}
955 	if (!handled) {
956 		switch (c) {
957 		case '.':
958 			if (p->g->cflags&REG_NEWLINE)
959 				nonnewline(p);
960 			else
961 				EMIT(OANY, 0);
962 			break;
963 		case '[':
964 			p_bracket(p);
965 			break;
966 		case BACKSL|'<':
967 			EMIT(OBOW, 0);
968 			break;
969 		case BACKSL|'>':
970 			EMIT(OEOW, 0);
971 			break;
972 		case BACKSL|'{':
973 			SETERROR(REG_BADRPT);
974 			break;
975 		case BACKSL|'(':
976 			p->g->nsub++;
977 			subno = (sopno)p->g->nsub;
978 			if (subno < NPAREN)
979 				p->pbegin[subno] = HERE();
980 			EMIT(OLPAREN, subno);
981 			/* the MORE here is an error heuristic */
982 			if (MORE() && !SEETWO('\\', ')'))
983 				p_re(p, '\\', ')');
984 			if (subno < NPAREN) {
985 				p->pend[subno] = HERE();
986 				assert(p->pend[subno] != 0);
987 			}
988 			EMIT(ORPAREN, subno);
989 			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
990 			break;
991 		case BACKSL|')':	/* should not get here -- must be user */
992 			SETERROR(REG_EPAREN);
993 			break;
994 		case BACKSL|'1':
995 		case BACKSL|'2':
996 		case BACKSL|'3':
997 		case BACKSL|'4':
998 		case BACKSL|'5':
999 		case BACKSL|'6':
1000 		case BACKSL|'7':
1001 		case BACKSL|'8':
1002 		case BACKSL|'9':
1003 			i = (c&~BACKSL) - '0';
1004 			assert(i < NPAREN);
1005 			if (p->pend[i] != 0) {
1006 				assert(i <= p->g->nsub);
1007 				EMIT(OBACK_, i);
1008 				assert(p->pbegin[i] != 0);
1009 				assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
1010 				assert(OP(p->strip[p->pend[i]]) == ORPAREN);
1011 				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
1012 				EMIT(O_BACK, i);
1013 			} else
1014 				SETERROR(REG_ESUBREG);
1015 			p->g->backrefs = 1;
1016 			break;
1017 		case '*':
1018 			/*
1019 			 * Ordinary if used as the first character beyond BOL anchor of
1020 			 * a (sub-)expression, counts as a bad repetition operator if it
1021 			 * appears otherwise.
1022 			 */
1023 			(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
1024 			/* FALLTHROUGH */
1025 		default:
1026 			if (p->error != 0)
1027 				return (false);	/* Definitely not $... */
1028 			p->next--;
1029 			wc = WGETNEXT();
1030 			if ((c & BACKSL) == 0 || may_escape(p, wc))
1031 				ordinary(p, wc);
1032 			else
1033 				SETERROR(REG_EESCAPE);
1034 			break;
1035 		}
1036 	}
1037 
1038 	if (EAT('*')) {		/* implemented as +? */
1039 		/* this case does not require the (y|) trick, noKLUDGE */
1040 		INSERT(OPLUS_, pos);
1041 		ASTERN(O_PLUS, pos);
1042 		INSERT(OQUEST_, pos);
1043 		ASTERN(O_QUEST, pos);
1044 #ifdef REGEX_GNU_EXTENSIONS
1045 	} else if (p->gnuext && EATTWO('\\', '?')) {
1046 		INSERT(OQUEST_, pos);
1047 		ASTERN(O_QUEST, pos);
1048 	} else if (p->gnuext && EATTWO('\\', '+')) {
1049 		INSERT(OPLUS_, pos);
1050 		ASTERN(O_PLUS, pos);
1051 #endif
1052 	} else if (EATTWO('\\', '{')) {
1053 		count = p_count(p);
1054 		if (EAT(',')) {
1055 			if (MORE() && isdigit((uch)PEEK())) {
1056 				count2 = p_count(p);
1057 				(void)REQUIRE(count <= count2, REG_BADBR);
1058 			} else		/* single number with comma */
1059 				count2 = INFINITY;
1060 		} else		/* just a single number */
1061 			count2 = count;
1062 		repeat(p, pos, count, count2);
1063 		if (!EATTWO('\\', '}')) {	/* error heuristics */
1064 			while (MORE() && !SEETWO('\\', '}'))
1065 				NEXT();
1066 			(void)REQUIRE(MORE(), REG_EBRACE);
1067 			SETERROR(REG_BADBR);
1068 		}
1069 	} else if (c == '$')     /* $ (but not \$) ends it */
1070 		return (true);
1071 
1072 	return (false);
1073 }
1074 
1075 /*
1076  - p_count - parse a repetition count
1077  == static int p_count(struct parse *p);
1078  */
1079 static int			/* the value */
p_count(struct parse * p)1080 p_count(struct parse *p)
1081 {
1082 	int count = 0;
1083 	int ndigits = 0;
1084 
1085 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1086 		count = count*10 + (GETNEXT() - '0');
1087 		ndigits++;
1088 	}
1089 
1090 	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1091 	return(count);
1092 }
1093 
1094 /*
1095  - p_bracket - parse a bracketed character list
1096  == static void p_bracket(struct parse *p);
1097  */
1098 static void
p_bracket(struct parse * p)1099 p_bracket(struct parse *p)
1100 {
1101 	cset *cs;
1102 	wint_t ch;
1103 
1104 	/* Dept of Truly Sickening Special-Case Kludges */
1105 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1106 		EMIT(OBOW, 0);
1107 		NEXTn(6);
1108 		return;
1109 	}
1110 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1111 		EMIT(OEOW, 0);
1112 		NEXTn(6);
1113 		return;
1114 	}
1115 
1116 	if ((cs = allocset(p)) == NULL)
1117 		return;
1118 
1119 	if (p->g->cflags&REG_ICASE)
1120 		cs->icase = 1;
1121 	if (EAT('^'))
1122 		cs->invert = 1;
1123 	if (EAT(']'))
1124 		CHadd(p, cs, ']');
1125 	else if (EAT('-'))
1126 		CHadd(p, cs, '-');
1127 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1128 		p_b_term(p, cs);
1129 	if (EAT('-'))
1130 		CHadd(p, cs, '-');
1131 	(void)MUSTEAT(']', REG_EBRACK);
1132 
1133 	if (p->error != 0)	/* don't mess things up further */
1134 		return;
1135 
1136 	if (cs->invert && p->g->cflags&REG_NEWLINE)
1137 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1138 
1139 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
1140 		ordinary(p, ch);
1141 		freeset(p, cs);
1142 	} else
1143 		EMIT(OANYOF, (size_t)(cs - p->g->sets));
1144 }
1145 
1146 static int
p_range_cmp(wchar_t c1,wchar_t c2)1147 p_range_cmp(wchar_t c1, wchar_t c2)
1148 {
1149 #ifdef REGEX_LIBC_COLLATE
1150 	return __wcollate_range_cmp(c1, c2);
1151 #elif defined(NLS)
1152 	/* Copied from libc/collate __wcollate_range_cmp */
1153 	wchar_t s1[2], s2[2];
1154 
1155 	s1[0] = c1;
1156 	s1[1] = L'\0';
1157 	s2[0] = c2;
1158 	s2[1] = L'\0';
1159 	return wcscoll(s1, s2);
1160 #else
1161 	char s1[2], s2[2];
1162 
1163 	s1[0] = (char)c1;
1164 	s1[1] = '\0';
1165 	s2[0] = (char)c2;
1166 	s2[1] = '\0';
1167 	return strcoll(s1, s2);
1168 #endif
1169 }
1170 
1171 /*
1172  - p_b_term - parse one term of a bracketed character list
1173  == static void p_b_term(struct parse *p, cset *cs);
1174  */
1175 static void
p_b_term(struct parse * p,cset * cs)1176 p_b_term(struct parse *p, cset *cs)
1177 {
1178 	char c;
1179 	wint_t start, finish;
1180 	wint_t i;
1181 #ifdef REGEX_LIBC_COLLATE
1182 	struct xlocale_collate *table =
1183 		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1184 #endif
1185 
1186 	_DIAGASSERT(p != NULL);
1187 	_DIAGASSERT(cs != NULL);
1188 
1189 	/* classify what we've got */
1190 	switch ((MORE()) ? PEEK() : '\0') {
1191 	case '[':
1192 		c = (MORE2()) ? PEEK2() : '\0';
1193 		break;
1194 	case '-':
1195 		SETERROR(REG_ERANGE);
1196 		return;			/* NOTE RETURN */
1197 	default:
1198 		c = '\0';
1199 		break;
1200 	}
1201 
1202 	switch (c) {
1203 	case ':':		/* character class */
1204 		NEXT2();
1205 		(void)REQUIRE(MORE(), REG_EBRACK);
1206 		c = PEEK();
1207 		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1208 		p_b_cclass(p, cs);
1209 		(void)REQUIRE(MORE(), REG_EBRACK);
1210 		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1211 		break;
1212 	case '=':		/* equivalence class */
1213 		NEXT2();
1214 		(void)REQUIRE(MORE(), REG_EBRACK);
1215 		c = PEEK();
1216 		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1217 		p_b_eclass(p, cs);
1218 		(void)REQUIRE(MORE(), REG_EBRACK);
1219 		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1220 		break;
1221 	default:		/* symbol, ordinary character, or range */
1222 		start = p_b_symbol(p);
1223 		if (SEE('-') && MORE2() && PEEK2() != ']') {
1224 			/* range */
1225 			NEXT();
1226 			if (EAT('-'))
1227 				finish = '-';
1228 			else
1229 				finish = p_b_symbol(p);
1230 		} else
1231 			finish = start;
1232 		if (start == finish)
1233 			CHadd(p, cs, start);
1234 		else {
1235 #ifdef REGEX_LIBC_COLLATE
1236 			if (table->__collate_load_error || MB_CUR_MAX > 1) {
1237 #else
1238 			if (MB_CUR_MAX > 1) {
1239 #endif
1240 				(void)REQUIRE(start <= finish, REG_ERANGE);
1241 				CHaddrange(p, cs, start, finish);
1242 			} else {
1243 				(void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1244 				for (i = 0; i <= UCHAR_MAX; i++) {
1245 					if (p_range_cmp(start, i) <= 0 &&
1246 					    p_range_cmp(i, finish) <= 0 )
1247 						CHadd(p, cs, i);
1248 				}
1249 			}
1250 		}
1251 		break;
1252 	}
1253 }
1254 
1255 #ifdef REGEX_GNU_EXTENSIONS
1256 /*
1257  - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1258  == static int p_b_pseudoclass(struct parse *p, char c)
1259  */
1260 static int
1261 p_b_pseudoclass(struct parse *p, char c) {
1262 	cset *cs;
1263 
1264 	if ((cs = allocset(p)) == NULL)
1265 		return(0);
1266 
1267 	if (p->g->cflags&REG_ICASE)
1268 		cs->icase = 1;
1269 
1270 	switch (c) {
1271 	case 'W':
1272 		cs->invert = 1;
1273 		/* FALLTHROUGH */
1274 	case 'w':
1275 		p_b_cclass_named(p, cs, "alnum");
1276 		break;
1277 	case 'S':
1278 		cs->invert = 1;
1279 		/* FALLTHROUGH */
1280 	case 's':
1281 		p_b_cclass_named(p, cs, "space");
1282 		break;
1283 	default:
1284 		return(0);
1285 	}
1286 
1287 	EMIT(OANYOF, (size_t)(cs - p->g->sets));
1288 	return(1);
1289 }
1290 #endif
1291 
1292 /*
1293  - p_b_cclass - parse a character-class name and deal with it
1294  == static void p_b_cclass(struct parse *p, cset *cs);
1295  */
1296 static void
1297 p_b_cclass(struct parse *p, cset *cs)
1298 {
1299 	const char *sp = p->next;
1300 	size_t len;
1301 	char clname[16];
1302 
1303 	while (MORE() && isalpha((uch)PEEK()))
1304 		NEXT();
1305 	len = p->next - sp;
1306 	if (len >= sizeof(clname) - 1) {
1307 		SETERROR(REG_ECTYPE);
1308 		return;
1309 	}
1310 	memcpy(clname, sp, len);
1311 	clname[len] = '\0';
1312 
1313 	p_b_cclass_named(p, cs, clname);
1314 }
1315 
1316 /*
1317  - p_b_cclass_named - deal with a named character class
1318  == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1319  */
1320 static void
1321 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1322 	wctype_t wct;
1323 
1324 	if ((wct = wctype(clname)) == 0) {
1325 		SETERROR(REG_ECTYPE);
1326 		return;
1327 	}
1328 	CHaddtype(p, cs, wct);
1329 }
1330 
1331 /*
1332  - p_b_eclass - parse an equivalence-class name and deal with it
1333  == static void p_b_eclass(struct parse *p, cset *cs);
1334  *
1335  * This implementation is incomplete. xxx
1336  */
1337 static void
1338 p_b_eclass(struct parse *p, cset *cs)
1339 {
1340 	wint_t c;
1341 
1342 	_DIAGASSERT(p != NULL);
1343 	_DIAGASSERT(cs != NULL);
1344 
1345 	c = p_b_coll_elem(p, '=');
1346 	CHadd(p, cs, c);
1347 }
1348 
1349 /*
1350  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1351  == static wint_t p_b_symbol(struct parse *p);
1352  */
1353 static wint_t			/* value of symbol */
1354 p_b_symbol(struct parse *p)
1355 {
1356 	wint_t value;
1357 
1358 	_DIAGASSERT(p != NULL);
1359 
1360 	(void)REQUIRE(MORE(), REG_EBRACK);
1361 	if (!EATTWO('[', '.'))
1362 		return(WGETNEXT());
1363 
1364 	/* collating symbol */
1365 	value = p_b_coll_elem(p, '.');
1366 	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1367 	return(value);
1368 }
1369 
1370 /*
1371  - p_b_coll_elem - parse a collating-element name and look it up
1372  == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1373  */
1374 static wint_t			/* value of collating element */
1375 p_b_coll_elem(struct parse *p,
1376 	wint_t endc)		/* name ended by endc,']' */
1377 {
1378 	const char *sp = p->next;
1379 	struct cname *cp;
1380 	size_t len;
1381 
1382 	_DIAGASSERT(p != NULL);
1383 
1384 	while (MORE() && !SEETWO(endc, ']'))
1385 		NEXT();
1386 	if (!MORE()) {
1387 		SETERROR(REG_EBRACK);
1388 		return(0);
1389 	}
1390 	len = p->next - sp;
1391 	for (cp = cnames; cp->name != NULL; cp++)
1392 		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1393 			return(cp->code);	/* known name */
1394 #ifdef NLS
1395 	mbstate_t mbs;
1396 	wchar_t wc;
1397 	size_t clen;
1398 
1399 	memset(&mbs, 0, sizeof(mbs));
1400 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1401 		return (wc);			/* single character */
1402 	else if (clen == (size_t)-1 || clen == (size_t)-2)
1403 		SETERROR(REG_ILLSEQ);
1404 	else
1405 		SETERROR(REG_ECOLLATE);		/* neither */
1406 	return(0);
1407 #else
1408 	if (len == 1)
1409 		return *sp;    /* single character */
1410 	SETERROR(REG_ECOLLATE);                 /* neither */
1411 	return 0;
1412 #endif
1413 }
1414 
1415 /*
1416  - may_escape - determine whether 'ch' is escape-able in the current context
1417  == static int may_escape(struct parse *p, const wint_t ch)
1418  */
1419 static bool
1420 may_escape(struct parse *p, const wint_t ch)
1421 {
1422 
1423 	if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1424 		return (true);
1425 	if (isalpha(ch) || ch == '\'' || ch == '`')
1426 		return (false);
1427 	return (true);
1428 #ifdef NOTYET
1429 	/*
1430 	 * Build a whitelist of characters that may be escaped to produce an
1431 	 * ordinary in the current context. This assumes that these have not
1432 	 * been otherwise interpreted as a special character. Escaping an
1433 	 * ordinary character yields undefined results according to
1434 	 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1435 	 * advantage of this and use escaped ordinary characters to provide
1436 	 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1437 	 */
1438 	switch(ch) {
1439 	case '|':
1440 	case '+':
1441 	case '?':
1442 		/* The above characters may not be escaped in BREs */
1443 		if (!(p->g->cflags&REG_EXTENDED))
1444 			return (false);
1445 		/* Fallthrough */
1446 	case '(':
1447 	case ')':
1448 	case '{':
1449 	case '}':
1450 	case '.':
1451 	case '[':
1452 	case ']':
1453 	case '\\':
1454 	case '*':
1455 	case '^':
1456 	case '$':
1457 		return (true);
1458 	default:
1459 		return (false);
1460 	}
1461 #endif
1462 }
1463 
1464 /*
1465  - othercase - return the case counterpart of an alphabetic
1466  == static wint_t othercase(wint_t ch);
1467  */
1468 static wint_t			/* if no counterpart, return ch */
1469 othercase(wint_t ch)
1470 {
1471 	assert(iswalpha(ch));
1472 	if (iswupper(ch))
1473 		return(towlower(ch));
1474 	else if (iswlower(ch))
1475 		return(towupper(ch));
1476 	else			/* peculiar, but could happen */
1477 		return(ch);
1478 }
1479 
1480 /*
1481  - bothcases - emit a dualcase version of a two-case character
1482  == static void bothcases(struct parse *p, wint_t ch);
1483  *
1484  * Boy, is this implementation ever a kludge...
1485  */
1486 static void
1487 bothcases(struct parse *p, wint_t ch)
1488 {
1489 	const char *oldnext = p->next;
1490 	const char *oldend = p->end;
1491 	char bracket[3 + MB_LEN_MAX];
1492 	size_t n;
1493 
1494 	_DIAGASSERT(p != NULL);
1495 
1496 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1497 	p->next = bracket;
1498 #ifdef NLS
1499 	mbstate_t mbs;
1500 	memset(&mbs, 0, sizeof(mbs));
1501 	n = wcrtomb(bracket, ch, &mbs);
1502 	assert(n != (size_t)-1);
1503 #else
1504 	n = 0;
1505 	bracket[n++] = ch;
1506 #endif
1507 	bracket[n] = ']';
1508 	bracket[n + 1] = '\0';
1509 	p->end = bracket+n+1;
1510 	p_bracket(p);
1511 	assert(p->next == p->end);
1512 	p->next = oldnext;
1513 	p->end = oldend;
1514 }
1515 
1516 /*
1517  - ordinary - emit an ordinary character
1518  == static void ordinary(struct parse *p, wint_t ch);
1519  */
1520 static void
1521 ordinary(struct parse *p, wint_t ch)
1522 {
1523 	cset *cs;
1524 
1525 	_DIAGASSERT(p != NULL);
1526 
1527 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1528 		bothcases(p, ch);
1529 	else if ((wint_t)(ch & OPDMASK) == ch)
1530 		EMIT(OCHAR, (size_t)ch);
1531 	else {
1532 		/*
1533 		 * Kludge: character is too big to fit into an OCHAR operand.
1534 		 * Emit a singleton set.
1535 		 */
1536 		if ((cs = allocset(p)) == NULL)
1537 			return;
1538 		CHadd(p, cs, ch);
1539 		EMIT(OANYOF, (size_t)(cs - p->g->sets));
1540 	}
1541 }
1542 
1543 /*
1544  - nonnewline - emit REG_NEWLINE version of OANY
1545  == static void nonnewline(struct parse *p);
1546  *
1547  * Boy, is this implementation ever a kludge...
1548  */
1549 static void
1550 nonnewline(struct parse *p)
1551 {
1552 	const char *oldnext = p->next;
1553 	const char *oldend = p->end;
1554 	char bracket[4];
1555 
1556 	_DIAGASSERT(p != NULL);
1557 
1558 	p->next = bracket;
1559 	p->end = bracket+3;
1560 	bracket[0] = '^';
1561 	bracket[1] = '\n';
1562 	bracket[2] = ']';
1563 	bracket[3] = '\0';
1564 	p_bracket(p);
1565 	assert(p->next == bracket+3);
1566 	p->next = oldnext;
1567 	p->end = oldend;
1568 }
1569 
1570 /*
1571  - repeat - generate code for a bounded repetition, recursively if needed
1572  == static void repeat(struct parse *p, sopno start, int from, int to);
1573  */
1574 static void
1575 repeat(struct parse *p,
1576 	sopno start,		/* operand from here to end of strip */
1577 	int from,		/* repeated from this number */
1578 	int to)			/* to this number of times (maybe INFINITY) */
1579 {
1580 	sopno finish = HERE();
1581 #	define	N	2
1582 #	define	INF	3
1583 #	define	REP(f, t)	((f)*8 + (t))
1584 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1585 	sopno copy;
1586 
1587 	_DIAGASSERT(p != NULL);
1588 
1589 	if (p->error != 0)	/* head off possible runaway recursion */
1590 		return;
1591 
1592 	assert(from <= to);
1593 
1594 	switch (REP(MAP(from), MAP(to))) {
1595 	case REP(0, 0):			/* must be user doing this */
1596 		DROP(finish-start);	/* drop the operand */
1597 		break;
1598 	case REP(0, 1):			/* as x{1,1}? */
1599 	case REP(0, N):			/* as x{1,n}? */
1600 	case REP(0, INF):		/* as x{1,}? */
1601 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1602 		INSERT(OCH_, start);		/* offset is wrong... */
1603 		repeat(p, start+1, 1, to);
1604 		ASTERN(OOR1, start);
1605 		AHEAD(start);			/* ... fix it */
1606 		EMIT(OOR2, 0);
1607 		AHEAD(THERE());
1608 		ASTERN(O_CH, THERETHERE());
1609 		break;
1610 	case REP(1, 1):			/* trivial case */
1611 		/* done */
1612 		break;
1613 	case REP(1, N):			/* as x?x{1,n-1} */
1614 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1615 		INSERT(OCH_, start);
1616 		ASTERN(OOR1, start);
1617 		AHEAD(start);
1618 		EMIT(OOR2, 0);			/* offset very wrong... */
1619 		AHEAD(THERE());			/* ...so fix it */
1620 		ASTERN(O_CH, THERETHERE());
1621 		copy = dupl(p, start+1, finish+1);
1622 		assert(copy == finish+4);
1623 		repeat(p, copy, 1, to-1);
1624 		break;
1625 	case REP(1, INF):		/* as x+ */
1626 		INSERT(OPLUS_, start);
1627 		ASTERN(O_PLUS, start);
1628 		break;
1629 	case REP(N, N):			/* as xx{m-1,n-1} */
1630 		copy = dupl(p, start, finish);
1631 		repeat(p, copy, from-1, to-1);
1632 		break;
1633 	case REP(N, INF):		/* as xx{n-1,INF} */
1634 		copy = dupl(p, start, finish);
1635 		repeat(p, copy, from-1, to);
1636 		break;
1637 	default:			/* "can't happen" */
1638 		SETERROR(REG_ASSERT);	/* just in case */
1639 		break;
1640 	}
1641 }
1642 
1643 /*
1644  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1645  - character from the parse struct, signals a REG_ILLSEQ error if the
1646  - character can't be converted. Returns the number of bytes consumed.
1647  */
1648 static wint_t
1649 wgetnext(struct parse *p)
1650 {
1651 #ifdef NLS
1652 	mbstate_t mbs;
1653 	wchar_t wc;
1654 	size_t n;
1655 
1656 	memset(&mbs, 0, sizeof(mbs));
1657 	n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1658 	if (n == (size_t)-1 || n == (size_t)-2) {
1659 		SETERROR(REG_ILLSEQ);
1660 		return (0);
1661 	}
1662 	if (n == 0)
1663 		n = 1;
1664 	p->next += n;
1665 	return wc;
1666 #else
1667 	return *p->next++;
1668 #endif
1669 }
1670 
1671 /*
1672  - seterr - set an error condition
1673  == static int seterr(struct parse *p, int e);
1674  */
1675 static int			/* useless but makes type checking happy */
1676 seterr(struct parse *p, int e)
1677 {
1678 
1679 	_DIAGASSERT(p != NULL);
1680 
1681 	if (p->error == 0)	/* keep earliest error condition */
1682 		p->error = e;
1683 	p->next = nuls;		/* try to bring things to a halt */
1684 	p->end = nuls;
1685 	return(0);		/* make the return value well-defined */
1686 }
1687 
1688 /*
1689  - allocset - allocate a set of characters for []
1690  == static cset *allocset(struct parse *p);
1691  */
1692 static cset *
1693 allocset(struct parse *p)
1694 {
1695 	cset *cs, *ncs;
1696 
1697 	_DIAGASSERT(p != NULL);
1698 
1699 	ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1700 	if (ncs == NULL) {
1701 		SETERROR(REG_ESPACE);
1702 		return (NULL);
1703 	}
1704 	p->g->sets = ncs;
1705 	cs = &p->g->sets[p->g->ncsets++];
1706 	memset(cs, 0, sizeof(*cs));
1707 
1708 	return(cs);
1709 }
1710 
1711 /*
1712  - freeset - free a now-unused set
1713  == static void freeset(struct parse *p, cset *cs);
1714  */
1715 static void
1716 freeset(struct parse *p, cset *cs)
1717 {
1718 	cset *top;
1719 
1720 	_DIAGASSERT(p != NULL);
1721 	_DIAGASSERT(cs != NULL);
1722 
1723 	top = &p->g->sets[p->g->ncsets];
1724 
1725 	free(cs->wides);
1726 	free(cs->ranges);
1727 	free(cs->types);
1728 	memset(cs, 0, sizeof(*cs));
1729 	if (cs == top-1)	/* recover only the easy case */
1730 		p->g->ncsets--;
1731 }
1732 
1733 /*
1734  - singleton - Determine whether a set contains only one character,
1735  - returning it if so, otherwise returning OUT.
1736  */
1737 static wint_t
1738 singleton(cset *cs)
1739 {
1740 	wint_t i, s, n;
1741 
1742 	for (i = n = 0; i < NC; i++)
1743 		if (CHIN(cs, i)) {
1744 			n++;
1745 			s = i;
1746 		}
1747 	if (n == 1)
1748 		return (s);
1749 	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1750 	    cs->icase == 0)
1751 		return (cs->wides[0]);
1752 	/* Don't bother handling the other cases. */
1753 	return (OUT);
1754 }
1755 
1756 /*
1757  - CHadd - add character to character set.
1758  */
1759 static void
1760 CHadd(struct parse *p, cset *cs, wint_t ch)
1761 {
1762 	wint_t nch, *newwides;
1763 
1764 	_DIAGASSERT(p != NULL);
1765 	_DIAGASSERT(cs != NULL);
1766 
1767 	assert(ch >= 0);
1768 	if (ch < NC)
1769 		cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1770 	else {
1771 		newwides = reallocarray(cs->wides, cs->nwides + 1,
1772 		    sizeof(*cs->wides));
1773 		if (newwides == NULL) {
1774 			SETERROR(REG_ESPACE);
1775 			return;
1776 		}
1777 		cs->wides = newwides;
1778 		cs->wides[cs->nwides++] = ch;
1779 	}
1780 	if (cs->icase) {
1781 		if ((nch = towlower(ch)) < NC)
1782 			cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1783 		if ((nch = towupper(ch)) < NC)
1784 			cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1785 	}
1786 }
1787 
1788 /*
1789  - CHaddrange - add all characters in the range [min,max] to a character set.
1790  */
1791 static void
1792 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1793 {
1794 	crange *newranges;
1795 
1796 	_DIAGASSERT(p != NULL);
1797 	_DIAGASSERT(cs != NULL);
1798 
1799 	for (; min < NC && min <= max; min++)
1800 		CHadd(p, cs, min);
1801 	if (min >= max)
1802 		return;
1803 	newranges = reallocarray(cs->ranges, cs->nranges + 1,
1804 	    sizeof(*cs->ranges));
1805 	if (newranges == NULL) {
1806 		SETERROR(REG_ESPACE);
1807 		return;
1808 	}
1809 	cs->ranges = newranges;
1810 	cs->ranges[cs->nranges].min = min;
1811 	cs->ranges[cs->nranges].max = max;
1812 	cs->nranges++;
1813 }
1814 
1815 /*
1816  - CHaddtype - add all characters of a certain type to a character set.
1817  */
1818 static void
1819 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1820 {
1821 	wint_t i;
1822 	wctype_t *newtypes;
1823 
1824 	_DIAGASSERT(p != NULL);
1825 	_DIAGASSERT(cs != NULL);
1826 
1827 	for (i = 0; i < NC; i++)
1828 		if (iswctype(i, wct))
1829 			CHadd(p, cs, i);
1830 	newtypes = reallocarray(cs->types, cs->ntypes + 1,
1831 	    sizeof(*cs->types));
1832 	if (newtypes == NULL) {
1833 		SETERROR(REG_ESPACE);
1834 		return;
1835 	}
1836 	cs->types = newtypes;
1837 	cs->types[cs->ntypes++] = wct;
1838 }
1839 
1840 /*
1841  - dupl - emit a duplicate of a bunch of sops
1842  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1843  */
1844 static sopno			/* start of duplicate */
1845 dupl(struct parse *p,
1846 	sopno start,		/* from here */
1847 	sopno finish)		/* to this less one */
1848 {
1849 	sopno ret = HERE();
1850 	sopno len = finish - start;
1851 
1852 	_DIAGASSERT(p != NULL);
1853 
1854 	assert(finish >= start);
1855 	if (len == 0)
1856 		return(ret);
1857 	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1858 		return(ret);
1859 	(void) memcpy(p->strip + p->slen,
1860 	    p->strip + start, len * sizeof(*p->strip));
1861 	p->slen += len;
1862 	return(ret);
1863 }
1864 
1865 /*
1866  - doemit - emit a strip operator
1867  == static void doemit(struct parse *p, sop op, size_t opnd);
1868  *
1869  * It might seem better to implement this as a macro with a function as
1870  * hard-case backup, but it's just too big and messy unless there are
1871  * some changes to the data structures.  Maybe later.
1872  */
1873 static void
1874 doemit(struct parse *p, sop op, size_t opnd)
1875 {
1876 	/* avoid making error situations worse */
1877 	if (p->error != 0)
1878 		return;
1879 
1880 	_DIAGASSERT(p != NULL);
1881 
1882 	/* deal with oversize operands ("can't happen", more or less) */
1883 	assert(opnd < 1<<OPSHIFT);
1884 
1885 	/* deal with undersized strip */
1886 	if (p->slen >= p->ssize)
1887 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1888 			return;
1889 
1890 	/* finally, it's all reduced to the easy case */
1891 	p->strip[p->slen++] = (sopno)SOP(op, opnd);
1892 }
1893 
1894 /*
1895  - doinsert - insert a sop into the strip
1896  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1897  */
1898 static void
1899 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1900 {
1901 	sopno sn;
1902 	sop s;
1903 	int i;
1904 
1905 	_DIAGASSERT(p != NULL);
1906 
1907 	/* avoid making error situations worse */
1908 	if (p->error != 0)
1909 		return;
1910 
1911 	sn = HERE();
1912 	EMIT(op, opnd);		/* do checks, ensure space */
1913 	assert(HERE() == sn+1);
1914 	s = p->strip[sn];
1915 
1916 	/* adjust paren pointers */
1917 	assert(pos > 0);
1918 	for (i = 1; i < NPAREN; i++) {
1919 		if (p->pbegin[i] >= pos) {
1920 			p->pbegin[i]++;
1921 		}
1922 		if (p->pend[i] >= pos) {
1923 			p->pend[i]++;
1924 		}
1925 	}
1926 
1927 	memmove(&p->strip[pos+1], &p->strip[pos],
1928 	    (HERE()-pos-1)*sizeof(*p->strip));
1929 	p->strip[pos] = s;
1930 }
1931 
1932 /*
1933  - dofwd - complete a forward reference
1934  == static void dofwd(struct parse *p, sopno pos, sop value);
1935  */
1936 static void
1937 dofwd(struct parse *p, sopno pos, sop value)
1938 {
1939 
1940 	_DIAGASSERT(p != NULL);
1941 
1942 	/* avoid making error situations worse */
1943 	if (p->error != 0)
1944 		return;
1945 
1946 	assert(value < 1<<OPSHIFT);
1947 	p->strip[pos] = OP(p->strip[pos]) | value;
1948 }
1949 
1950 /*
1951  - enlarge - enlarge the strip
1952  == static int enlarge(struct parse *p, sopno size);
1953  */
1954 static int
1955 enlarge(struct parse *p, sopno size)
1956 {
1957 	sop *sp;
1958 
1959 	_DIAGASSERT(p != NULL);
1960 
1961 	if (p->ssize >= size)
1962 		return 1;
1963 
1964 	sp = reallocarray(p->strip, size, sizeof(*p->strip));
1965 	if (sp == NULL) {
1966 		SETERROR(REG_ESPACE);
1967 		return 0;
1968 	}
1969 	p->strip = sp;
1970 	p->ssize = size;
1971 	return 1;
1972 }
1973 
1974 /*
1975  - stripsnug - compact the strip
1976  == static void stripsnug(struct parse *p, struct re_guts *g);
1977  */
1978 static void
1979 stripsnug(struct parse *p, struct re_guts *g)
1980 {
1981 
1982 	_DIAGASSERT(p != NULL);
1983 	_DIAGASSERT(g != NULL);
1984 
1985 	g->nstates = p->slen;
1986 	g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1987 	if (g->strip == NULL) {
1988 		SETERROR(REG_ESPACE);
1989 		g->strip = p->strip;
1990 	}
1991 }
1992 
1993 /*
1994  - findmust - fill in must and mlen with longest mandatory literal string
1995  == static void findmust(struct parse *p, struct re_guts *g);
1996  *
1997  * This algorithm could do fancy things like analyzing the operands of |
1998  * for common subsequences.  Someday.  This code is simple and finds most
1999  * of the interesting cases.
2000  *
2001  * Note that must and mlen got initialized during setup.
2002  */
2003 static void
2004 findmust(struct parse *p, struct re_guts *g)
2005 {
2006 	sop *scan;
2007 	sop *start = NULL;
2008 	sop *newstart = NULL;
2009 	sopno newlen;
2010 	sop s;
2011 	char *cp;
2012 	int offset;
2013 	mbstate_t mbs;
2014 
2015 	_DIAGASSERT(p != NULL);
2016 	_DIAGASSERT(g != NULL);
2017 
2018 	/* avoid making error situations worse */
2019 	if (p->error != 0)
2020 		return;
2021 
2022 #ifdef notyet
2023 	/*
2024 	 * It's not generally safe to do a ``char'' substring search on
2025 	 * multibyte character strings, but it's safe for at least
2026 	 * UTF-8 (see RFC 3629).
2027 	 */
2028 	if (MB_CUR_MAX > 1 &&
2029 	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
2030 		return;
2031 #endif
2032 
2033 	/* find the longest OCHAR sequence in strip */
2034 	newlen = 0;
2035 	offset = 0;
2036 	g->moffset = 0;
2037 	scan = g->strip + 1;
2038 	do {
2039 		s = *scan++;
2040 		switch (OP(s)) {
2041 		case OCHAR:		/* sequence member */
2042 			if (newlen == 0) {		/* new sequence */
2043 				memset(&mbs, 0, sizeof(mbs));
2044 				newstart = scan - 1;
2045 			}
2046 #ifdef NLS
2047 			char buf[MB_LEN_MAX];
2048 			size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
2049 			if (clen == (size_t)-1)
2050 				goto toohard;
2051 			newlen += (sopno)clen;
2052 #else
2053 			newlen++;
2054 #endif
2055 			break;
2056 		case OPLUS_:		/* things that don't break one */
2057 		case OLPAREN:
2058 		case ORPAREN:
2059 			break;
2060 		case OQUEST_:		/* things that must be skipped */
2061 		case OCH_:
2062 			offset = altoffset(scan, offset);
2063 			scan--;
2064 			do {
2065 				scan += OPND(s);
2066 				s = *scan;
2067 				/* assert() interferes w debug printouts */
2068 				if (OP(s) != O_QUEST &&
2069 				    OP(s) != O_CH && OP(s) != OOR2) {
2070 					g->iflags |= BAD;
2071 					return;
2072 				}
2073 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
2074 			/* FALLTHROUGH */
2075 		case OBOW:		/* things that break a sequence */
2076 		case OEOW:
2077 		case OBOL:
2078 		case OEOL:
2079 		case OBOS:
2080 		case OEOS:
2081 		case OWBND:
2082 		case ONWBND:
2083 		case O_QUEST:
2084 		case O_CH:
2085 		case OEND:
2086 			if (newlen > (sopno)g->mlen) {		/* ends one */
2087 				start = newstart;
2088 				g->mlen = newlen;
2089 				if (offset > -1) {
2090 					g->moffset += offset;
2091 					offset = newlen;
2092 				} else
2093 					g->moffset = offset;
2094 			} else {
2095 				if (offset > -1)
2096 					offset += newlen;
2097 			}
2098 			newlen = 0;
2099 			break;
2100 		case OANY:
2101 			if (newlen > (sopno)g->mlen) {		/* ends one */
2102 				start = newstart;
2103 				g->mlen = newlen;
2104 				if (offset > -1) {
2105 					g->moffset += offset;
2106 					offset = newlen;
2107 				} else
2108 					g->moffset = offset;
2109 			} else {
2110 				if (offset > -1)
2111 					offset += newlen;
2112 			}
2113 			if (offset > -1)
2114 				offset++;
2115 			newlen = 0;
2116 			break;
2117 		case OANYOF:		/* may or may not invalidate offset */
2118 			/* First, everything as OANY */
2119 			if (newlen > (sopno)g->mlen) {		/* ends one */
2120 				start = newstart;
2121 				g->mlen = newlen;
2122 				if (offset > -1) {
2123 					g->moffset += offset;
2124 					offset = newlen;
2125 				} else
2126 					g->moffset = offset;
2127 			} else {
2128 				if (offset > -1)
2129 					offset += newlen;
2130 			}
2131 			if (offset > -1)
2132 				offset++;
2133 			newlen = 0;
2134 			break;
2135 #ifdef NLS
2136 		toohard:/*FALLTHROUGH*/
2137 #endif
2138 		default:
2139 			/* Anything here makes it impossible or too hard
2140 			 * to calculate the offset -- so we give up;
2141 			 * save the last known good offset, in case the
2142 			 * must sequence doesn't occur later.
2143 			 */
2144 			if (newlen > (sopno)g->mlen) {		/* ends one */
2145 				start = newstart;
2146 				g->mlen = newlen;
2147 				if (offset > -1)
2148 					g->moffset += offset;
2149 				else
2150 					g->moffset = offset;
2151 			}
2152 			offset = -1;
2153 			newlen = 0;
2154 			break;
2155 		}
2156 	} while (OP(s) != OEND);
2157 
2158 	if (g->mlen == 0) {		/* there isn't one */
2159 		g->moffset = -1;
2160 		return;
2161 	}
2162 
2163 	/* turn it into a character string */
2164 	g->must = malloc((size_t)g->mlen + 1);
2165 	if (g->must == NULL) {		/* argh; just forget it */
2166 		g->mlen = 0;
2167 		g->moffset = -1;
2168 		return;
2169 	}
2170 	cp = g->must;
2171 	scan = start;
2172 	memset(&mbs, 0, sizeof(mbs));
2173 	while (cp < g->must + g->mlen) {
2174 		while (OP(s = *scan++) != OCHAR)
2175 			continue;
2176 #ifdef NLS
2177 		size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
2178 		assert(clen != (size_t)-1);
2179 		cp += clen;
2180 #else
2181 		*cp++ = OPND(s);
2182 #endif
2183 	}
2184 	assert(cp == g->must + g->mlen);
2185 	*cp++ = '\0';		/* just on general principles */
2186 }
2187 
2188 /*
2189  - altoffset - choose biggest offset among multiple choices
2190  == static int altoffset(sop *scan, int offset);
2191  *
2192  * Compute, recursively if necessary, the largest offset among multiple
2193  * re paths.
2194  */
2195 static int
2196 altoffset(sop *scan, int offset)
2197 {
2198 	int largest;
2199 	int try;
2200 	sop s;
2201 
2202 	_DIAGASSERT(scan != NULL);
2203 
2204 	/* If we gave up already on offsets, return */
2205 	if (offset == -1)
2206 		return -1;
2207 
2208 	largest = 0;
2209 	try = 0;
2210 	s = *scan++;
2211 	while (OP(s) != O_QUEST && OP(s) != O_CH) {
2212 		switch (OP(s)) {
2213 		case OOR1:
2214 			if (try > largest)
2215 				largest = try;
2216 			try = 0;
2217 			break;
2218 		case OQUEST_:
2219 		case OCH_:
2220 			try = altoffset(scan, try);
2221 			if (try == -1)
2222 				return -1;
2223 			scan--;
2224 			do {
2225 				scan += OPND(s);
2226 				s = *scan;
2227 				if (OP(s) != O_QUEST &&
2228 				    OP(s) != O_CH && OP(s) != OOR2)
2229 					return -1;
2230 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
2231 			/* We must skip to the next position, or we'll
2232 			 * leave altoffset() too early.
2233 			 */
2234 			scan++;
2235 			break;
2236 		case OANYOF:
2237 		case OCHAR:
2238 		case OANY:
2239 			try++;
2240 			/*FALLTHROUGH*/
2241 		case OBOW:
2242 		case OEOW:
2243 		case OWBND:
2244 		case ONWBND:
2245 		case OLPAREN:
2246 		case ORPAREN:
2247 		case OOR2:
2248 			break;
2249 		default:
2250 			try = -1;
2251 			break;
2252 		}
2253 		if (try == -1)
2254 			return -1;
2255 		s = *scan++;
2256 	}
2257 
2258 	if (try > largest)
2259 		largest = try;
2260 
2261 	return largest+offset;
2262 }
2263 
2264 /*
2265  - computejumps - compute char jumps for BM scan
2266  == static void computejumps(struct parse *p, struct re_guts *g);
2267  *
2268  * This algorithm assumes g->must exists and is has size greater than
2269  * zero. It's based on the algorithm found on Computer Algorithms by
2270  * Sara Baase.
2271  *
2272  * A char jump is the number of characters one needs to jump based on
2273  * the value of the character from the text that was mismatched.
2274  */
2275 static void
2276 computejumps(struct parse *p, struct re_guts *g)
2277 {
2278 	int ch;
2279 	size_t mindex;
2280 
2281 	_DIAGASSERT(p != NULL);
2282 	_DIAGASSERT(g != NULL);
2283 
2284 	/* Avoid making errors worse */
2285 	if (p->error != 0)
2286 		return;
2287 
2288 	g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2289 	if (g->charjump == NULL)	/* Not a fatal error */
2290 		return;
2291 	/* Adjust for signed chars, if necessary */
2292 	g->charjump = &g->charjump[-(CHAR_MIN)];
2293 
2294 	/* If the character does not exist in the pattern, the jump
2295 	 * is equal to the number of characters in the pattern.
2296 	 */
2297 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2298 		g->charjump[ch] = g->mlen;
2299 
2300 	/* If the character does exist, compute the jump that would
2301 	 * take us to the last character in the pattern equal to it
2302 	 * (notice that we match right to left, so that last character
2303 	 * is the first one that would be matched).
2304 	 */
2305 	for (mindex = 0; mindex < g->mlen; mindex++)
2306 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2307 }
2308 
2309 /*
2310  - computematchjumps - compute match jumps for BM scan
2311  == static void computematchjumps(struct parse *p, struct re_guts *g);
2312  *
2313  * This algorithm assumes g->must exists and is has size greater than
2314  * zero. It's based on the algorithm found on Computer Algorithms by
2315  * Sara Baase.
2316  *
2317  * A match jump is the number of characters one needs to advance based
2318  * on the already-matched suffix.
2319  * Notice that all values here are minus (g->mlen-1), because of the way
2320  * the search algorithm works.
2321  */
2322 static void
2323 computematchjumps(struct parse *p, struct re_guts *g)
2324 {
2325 	size_t mindex;		/* General "must" iterator */
2326 	size_t suffix;		/* Keeps track of matching suffix */
2327 	size_t ssuffix;		/* Keeps track of suffixes' suffix */
2328 	size_t* pmatches;	/* pmatches[k] points to the next i
2329 				 * such that i+1...mlen is a substring
2330 				 * of k+1...k+mlen-i-1
2331 				 */
2332 
2333 	_DIAGASSERT(p != NULL);
2334 	_DIAGASSERT(g != NULL);
2335 
2336 	/* Avoid making errors worse */
2337 	if (p->error != 0)
2338 		return;
2339 
2340 	pmatches = calloc(g->mlen, sizeof(*pmatches));
2341 	if (pmatches == NULL) {
2342 		g->matchjump = NULL;
2343 		return;
2344 	}
2345 
2346 	g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2347 	if (g->matchjump == NULL) {	/* Not a fatal error */
2348 		free(pmatches);
2349 		return;
2350 	}
2351 
2352 	/* Set maximum possible jump for each character in the pattern */
2353 	for (mindex = 0; mindex < g->mlen; mindex++)
2354 		g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2355 
2356 	/* Compute pmatches[] */
2357 	for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2358 		pmatches[mindex] = suffix;
2359 
2360 		/* If a mismatch is found, interrupting the substring,
2361 		 * compute the matchjump for that position. If no
2362 		 * mismatch is found, then a text substring mismatched
2363 		 * against the suffix will also mismatch against the
2364 		 * substring.
2365 		 */
2366 		while (suffix < g->mlen
2367 		    && g->must[mindex] != g->must[suffix]) {
2368 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
2369 			    g->mlen - mindex - 1);
2370 			suffix = pmatches[suffix];
2371 		}
2372 	}
2373 
2374 	/* Compute the matchjump up to the last substring found to jump
2375 	 * to the beginning of the largest must pattern prefix matching
2376 	 * it's own suffix.
2377 	 */
2378 	for (mindex = 0; mindex <= suffix; mindex++)
2379 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
2380 		    g->mlen + suffix - mindex);
2381 
2382         ssuffix = pmatches[suffix];
2383         while (suffix < g->mlen) {
2384                 while (suffix <= ssuffix && suffix < g->mlen) {
2385                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
2386 			    g->mlen + ssuffix - suffix);
2387                         suffix++;
2388                 }
2389 		if (suffix < g->mlen)
2390                 	ssuffix = pmatches[ssuffix];
2391         }
2392 
2393 	free(pmatches);
2394 }
2395 
2396 /*
2397  - pluscount - count + nesting
2398  == static sopno pluscount(struct parse *p, struct re_guts *g);
2399  */
2400 static sopno			/* nesting depth */
2401 pluscount(struct parse *p, struct re_guts *g)
2402 {
2403 	sop *scan;
2404 	sop s;
2405 	sopno plusnest = 0;
2406 	sopno maxnest = 0;
2407 
2408 	_DIAGASSERT(p != NULL);
2409 	_DIAGASSERT(g != NULL);
2410 
2411 	if (p->error != 0)
2412 		return(0);	/* there may not be an OEND */
2413 
2414 	scan = g->strip + 1;
2415 	do {
2416 		s = *scan++;
2417 		switch (OP(s)) {
2418 		case OPLUS_:
2419 			plusnest++;
2420 			break;
2421 		case O_PLUS:
2422 			if (plusnest > maxnest)
2423 				maxnest = plusnest;
2424 			plusnest--;
2425 			break;
2426 		}
2427 	} while (OP(s) != OEND);
2428 	if (plusnest != 0)
2429 		g->iflags |= BAD;
2430 	return(maxnest);
2431 }
2432