xref: /illumos-gate/usr/src/lib/libc/port/regex/regcmp.c (revision 6bb6b5762ca4b17cd5fb3c6c123f17489d5635aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 /*
31  * IMPORTANT NOTE:
32  *
33  * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS.
34  * IT IS **NOT** CHARACTER SET INDEPENDENT.
35  *
36  */
37 
38 #pragma weak _regcmp = regcmp
39 
40 #include "lint.h"
41 #include "mtlib.h"
42 #include <limits.h>
43 #include <stdarg.h>
44 #include <stdlib.h>
45 #include <thread.h>
46 #include <wctype.h>
47 #include <widec.h>
48 #include <string.h>
49 #include "tsd.h"
50 
51 
52 /* CONSTANTS SHARED WITH regex() */
53 
54 #include "regex.h"
55 
56 /* PRIVATE CONSTANTS */
57 
/*
 * Characters that have a special meaning in the regular expression
 * syntax accepted by regcmp(), plus two sizing constants used by the
 * compiler itself.
 */
#define	BACKSLASH		'\\'
#define	CIRCUMFLEX		'^'
#define	COMMA			','
#define	DASH			'-'
#define	DOLLAR_SIGN		'$'
#define	DOT			'.'
#define	LEFT_CURLY_BRACE	'{'
#define	LEFT_PAREN		'('
#define	LEFT_SQUARE_BRACKET	'['
#define	PLUS			'+'
#define	RIGHT_CURLY_BRACE	'}'
#define	RIGHT_PAREN		')'
#define	RIGHT_SQUARE_BRACKET	']'
/* mask that extracts the low-order byte of a group length */
#define	SINGLE_BYTE_MASK	0xff
/* capacity of compilep_stack, i.e. the deepest nesting of open groups */
#define	STRINGP_STACK_SIZE	50
#define	STAR			'*'
74 
75 /* PRIVATE GLOBAL VARIABLES */
76 
/*
 * Stack of pointers to the start of each currently open parenthesized
 * group in the compiled expression.  The stack grows downward:
 * regcmp() resets compilep_stackp to one past the last element,
 * push_compilep() decrements it and pop_compilep() increments it.
 */
static char	*compilep_stack[STRINGP_STACK_SIZE];
static char	**compilep_stackp;
/* held by regcmp() for the whole time it uses the shared stack above */
static mutex_t  regcmp_lock = DEFAULTMUTEX;
80 
81 /* DECLARATIONS OF PRIVATE FUNCTIONS */
82 
static int add_char(char *compilep, wchar_t wchar);
static int add_single_char_expr(char *compilep, wchar_t wchar);

/*
 * ERROR_EXIT() is regcmp()'s common failure path: it ends the argument
 * list traversal, releases the lock, frees the partially compiled
 * expression (if any) and returns a null pointer from the enclosing
 * function.
 *
 * NOTE: the macro expands to several bare statements, not a single
 * statement, so it must only be used inside a braced block; an
 * unbraced "if (x) ERROR_EXIT(...);" would guard the va_end() alone.
 */
#define	ERROR_EXIT(mutex_lockp, arg_listp, compile_startp) \
\
	va_end(arg_listp); \
	lmutex_unlock(mutex_lockp); \
	if ((compile_startp) != (char *)0) \
		free((void *)compile_startp); \
	return ((char *)0)

static int get_count(int *countp, const char *regexp);
static int get_digit(const char *regexp);
static int get_wchar(wchar_t *wchar, const char *regexp);
static char *pop_compilep(void);
static char *push_compilep(char *compilep);
static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char);
100 
101 
102 /* DEFINITIONS OF PUBLIC VARIABLES */
103 
104 int __i_size;
105 
106 /*
107  * define thread-specific storage for __i_size
108  *
109  */
110 int *
111 ___i_size(void)
112 {
113 	if (thr_main())
114 		return (&__i_size);
115 	return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL));
116 }
117 
118 #define		__i_size (*(___i_size()))
119 
120 /* DEFINITION OF regcmp() */
121 
122 extern char *
123 regcmp(const char *regexp, ...)
124 {
125 	va_list		arg_listp;
126 	size_t		arg_strlen;
127 	boolean_t	can_repeat;
128 	int		char_size;
129 	unsigned int	class_length;
130 	char		*compilep;
131 	char		*compile_startp = (char *)0;
132 	int		count_length;
133 	wchar_t		current_char;
134 	int		expr_length;
135 	int		groupn;
136 	unsigned int	group_length;
137 	unsigned int	high_bits;
138 	boolean_t	dash_indicates_range;
139 	unsigned int	low_bits;
140 	int		max_count;
141 	int		min_count;
142 	const char	*next_argp;
143 	wchar_t		first_char_in_range;
144 	char		*regex_typep;
145 	int		return_arg_number;
146 	int		substringn;
147 
148 	if (___i_size() == (int *)0)
149 		return ((char *)0);
150 
151 	/*
152 	 * When compiling a regular expression, regcmp() generates at most
153 	 * two extra single-byte characters for each character in the
154 	 * expression, so allocating three times the number of bytes in all
155 	 * the strings that comprise the regular expression will ensure that
156 	 * regcmp() won't overwrite the end of the allocated block when
157 	 * compiling the expression.
158 	 */
159 
160 	va_start(arg_listp, regexp);
161 	next_argp = regexp;
162 	arg_strlen = 0;
163 	while (next_argp != (char *)0) {
164 		arg_strlen += strlen(next_argp);
165 		next_argp = va_arg(arg_listp, /* const */ char *);
166 	}
167 	va_end(arg_listp);
168 
169 	if (arg_strlen == 0)
170 		return ((char *)0);
171 	compile_startp = (char *)malloc(3 * arg_strlen + 1);
172 	if (compile_startp == (char *)0)
173 		return ((char *)0);
174 
175 	lmutex_lock(&regcmp_lock);
176 	__i_size = 0;
177 	compilep = compile_startp;
178 	compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE];
179 
180 	/* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */
181 	va_start(arg_listp, regexp);
182 	next_argp = va_arg(arg_listp, /* const */ char *);
183 	char_size = get_wchar(&current_char, regexp);
184 	if (char_size < 0) {
185 		ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
186 	} else if (char_size > 0) {
187 		regexp += char_size;
188 	} else /* (char_size == 0 ) */ {
189 		regexp = next_argp;
190 		next_argp = va_arg(arg_listp, /* const */ char *);
191 		char_size = get_wchar(&current_char, regexp);
192 		if (char_size <= 0) {
193 			ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
194 		} else {
195 			regexp += char_size;
196 		}
197 	}
198 
199 	/* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */
200 
201 	if (current_char == CIRCUMFLEX) {
202 		char_size = get_wchar(&current_char, regexp);
203 		if (char_size < 0) {
204 			ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
205 		} else if (char_size > 0) {
206 			regexp += char_size;
207 			*compilep = (unsigned char)START_OF_STRING_MARK;
208 			compilep++;
209 		} else if /* (char_size == 0) && */ (next_argp != (char *)0) {
210 			regexp = next_argp;
211 			next_argp = va_arg(arg_listp, /* const */ char *);
212 			char_size = get_wchar(&current_char, regexp);
213 			if (char_size <= 0) {
214 				ERROR_EXIT(&regcmp_lock, arg_listp,
215 				    compile_startp);
216 			} else {
217 				regexp += char_size;
218 			}
219 			*compilep = (unsigned char)START_OF_STRING_MARK;
220 			compilep++;
221 		} else {
222 			/* ((char_size==0) && (next_argp==(char *)0)) */
223 			/*
224 			 * the regular expression is "^"
225 			 */
226 			*compilep = (unsigned char)START_OF_STRING_MARK;
227 			compilep++;
228 			*compilep = (unsigned char)END_REGEX;
229 			compilep++;
230 			*compilep = '\0';
231 			compilep++;
232 			__i_size = (int)(compilep - compile_startp);
233 			va_end(arg_listp);
234 			lmutex_unlock(&regcmp_lock);
235 			return (compile_startp);
236 		}
237 	}
238 
239 	/* COMPILE THE REGULAR EXPRESSION */
240 
241 	groupn = 0;
242 	substringn = 0;
243 	can_repeat = B_FALSE;
244 	for (;;) {
245 
246 		/*
247 		 * At the end of each iteration get the next character
248 		 * from the regular expression and increment regexp to
249 		 * point to the following character.  Exit when all
250 		 * the characters in all the strings in the argument
251 		 * list have been read.
252 		 */
253 
254 		switch (current_char) {
255 
256 			/*
257 			 * No fall-through.  Each case ends with either
258 			 * a break or an error exit.  Each case starts
259 			 * with compilep addressing the next location to
260 			 * be written in the compiled regular expression,
261 			 * and with regexp addressing the next character
262 			 * to be read from the regular expression being
263 			 * compiled.  Each case that doesn't return
264 			 * increments regexp to address the next character
265 			 * to be read from the regular expression and
266 			 * increments compilep to address the next
267 			 * location to be written in the compiled
268 			 * regular expression.
269 			 *
270 			 * NOTE: The comments for each case give the meaning
271 			 * of the regular expression compiled by the case
272 			 * and the character string written to the compiled
273 			 * regular expression by the case.  Each single
274 			 * character
275 			 * written to the compiled regular expression is
276 			 * shown enclosed in angle brackets (<>).  Each
277 			 * compiled regular expression begins with a marker
278 			 * character which is shown as a named constant
279 			 * (e.g. <ASCII_CHAR>). Character constants are
280 			 * shown enclosed in single quotes (e.g. <'$'>).
281 			 * All other single characters written to the
282 			 * compiled regular expression are shown as lower
283 			 * case variable names (e.g. <ascii_char> or
284 			 * <multibyte_char>). Multicharacter
285 			 * strings written to the compiled regular expression
286 			 * are shown as variable names followed by elipses
287 			 * (e.g. <regex...>).
288 			 */
289 
290 		case DOLLAR_SIGN:
291 			/* end of string marker or simple dollar sign */
292 			/* compiles to <END_OF_STRING_MARK> or */
293 			/* <ASCII_CHAR><'$'> */
294 
295 			char_size = get_wchar(&current_char, regexp);
296 			if ((char_size == 0) && (next_argp == (char *)0)) {
297 				can_repeat = B_FALSE;
298 				*compilep = (unsigned char)END_OF_STRING_MARK;
299 				compilep++;
300 			} else {
301 				can_repeat = B_TRUE;
302 				*compilep = (unsigned char)ASCII_CHAR;
303 				regex_typep = compilep;
304 				compilep++;
305 				*compilep = DOLLAR_SIGN;
306 				compilep++;
307 			}
308 			break; /* end case DOLLAR_SIGN */
309 
310 		case DOT: /* any character */
311 
312 			/* compiles to <ANY_CHAR> */
313 
314 			can_repeat = B_TRUE;
315 			*compilep = (unsigned char)ANY_CHAR;
316 			regex_typep = compilep;
317 			compilep++;
318 
319 			break; /* end case DOT */
320 
321 		case BACKSLASH: /* escaped character */
322 
323 			/*
324 			 * compiles to <ASCII_CHAR><ascii_char> or
325 			 * <MULTIBYTE_CHAR><multibyte_char>
326 			 */
327 
328 			char_size = get_wchar(&current_char, regexp);
329 			if (char_size <= 0) {
330 				ERROR_EXIT(&regcmp_lock, arg_listp,
331 				    compile_startp);
332 			} else {
333 				regexp += char_size;
334 				can_repeat = B_TRUE;
335 				expr_length = add_single_char_expr(
336 				    compilep, current_char);
337 				regex_typep = compilep;
338 				compilep += expr_length;
339 			}
340 			break; /* end case '\\' */
341 
342 		case LEFT_SQUARE_BRACKET:
343 			/* start of a character class expression */
344 
345 			/*
346 			 * [^...c...] compiles to
347 			 * <NOT_IN_CLASS><class_length><...c...>
348 			 * [^...a-z...] compiles to
349 			 * <NOT_IN_CLASS><class_length><...a<THRU>z...>
350 			 * [...c...] compiles to
351 			 * <IN_CLASS><class_length><...c...>
352 			 * [...a-z...] compiles to
353 			 * <IN_CLASS><class_length><...a<THRU>z...>
354 			 *
355 			 * NOTE: <class_length> includes the
356 			 * <class_length> byte
357 			 */
358 
359 			can_repeat = B_TRUE;
360 			regex_typep = compilep;
361 
362 			/* DETERMINE THE CLASS TYPE */
363 
364 			/*
365 			 * NOTE: This algorithm checks the value of the
366 			 * "multibyte"
367 			 * macro in <euc.h> (included in <widec.h> )
368 			 * to find out if regcmp()
369 			 * is compiling the regular expression in a
370 			 * multibyte locale.
371 			 */
372 			char_size = get_wchar(&current_char, regexp);
373 			if (char_size <= 0) {
374 				ERROR_EXIT(&regcmp_lock, arg_listp,
375 				    compile_startp);
376 			} else if (current_char == CIRCUMFLEX) {
377 				regexp++;
378 				char_size = get_wchar(&current_char, regexp);
379 				if (char_size <= 0) {
380 					ERROR_EXIT(&regcmp_lock,
381 					    arg_listp, compile_startp);
382 				} else {
383 					regexp += char_size;
384 					if (!multibyte) {
385 						*compilep = (unsigned char)
386 						    NOT_IN_ASCII_CHAR_CLASS;
387 					} else {
388 						*compilep = (unsigned char)
389 						    NOT_IN_MULTIBYTE_CHAR_CLASS;
390 					}
391 					/* leave space for <class_length> */
392 					compilep += 2;
393 				}
394 			} else {
395 				regexp += char_size;
396 				if (!multibyte) {
397 					*compilep = (unsigned char)
398 					    IN_ASCII_CHAR_CLASS;
399 				} else {
400 					*compilep = (unsigned char)
401 					    IN_MULTIBYTE_CHAR_CLASS;
402 				}
403 				/* leave space for <class_length> */
404 				compilep += 2;
405 			}
406 
407 			/* COMPILE THE CLASS */
408 			/*
409 			 * check for a leading right square bracket,
410 			 * which is allowed
411 			 */
412 
413 			if (current_char == RIGHT_SQUARE_BRACKET) {
414 				/*
415 				 * the leading RIGHT_SQUARE_BRACKET may
416 				 * be part of a character range
417 				 * expression like "[]-\]"
418 				 */
419 				dash_indicates_range = B_TRUE;
420 				first_char_in_range = current_char;
421 				char_size = get_wchar(&current_char, regexp);
422 				if (char_size <= 0) {
423 					ERROR_EXIT(&regcmp_lock,
424 					    arg_listp, compile_startp);
425 				} else {
426 					regexp += char_size;
427 					*compilep = RIGHT_SQUARE_BRACKET;
428 					compilep++;
429 				}
430 			} else {
431 				/*
432 				 * decode the character in the following
433 				 * while loop and decide then if it can
434 				 * be the first character
435 				 * in a character range expression
436 				 */
437 				dash_indicates_range = B_FALSE;
438 			}
439 
440 			while (current_char != RIGHT_SQUARE_BRACKET) {
441 				if (current_char != DASH) {
442 					/*
443 					 * if a DASH follows current_char,
444 					 *  current_char, the DASH and the
445 					 * character that follows the DASH
446 					 * may form a character range
447 					 * expression
448 					 */
449 					dash_indicates_range = B_TRUE;
450 					first_char_in_range = current_char;
451 					expr_length = add_char(
452 					    compilep, current_char);
453 					compilep += expr_length;
454 
455 				} else if /* (current_char == DASH) && */
456 				    (dash_indicates_range == B_FALSE) {
457 					/*
458 					 * current_char is a DASH, but
459 					 * either begins the entire
460 					 * character class or follows a
461 					 * character that's already
462 					 * part of a character range
463 					 * expression, so it simply
464 					 * represents the DASH character
465 					 * itself
466 					 */
467 					*compilep = DASH;
468 					compilep ++;
469 					/*
470 					 * if another DASH follows this
471 					 * one, this DASH is part
472 					 * of a character range expression
473 					 * like "[--\]"
474 					 */
475 					dash_indicates_range = B_TRUE;
476 					first_char_in_range = current_char;
477 
478 				} else {
479 					/*
480 					 * ((current_char == DASH &&/
481 					 * (dash_indicates_range == B_TRUE))
482 					 */
483 
484 					/*
485 					 * the DASH appears after a single
486 					 * character that isn't
487 					 * already part of a character
488 					 * range expression, so it
489 					 * and the characters preceding
490 					 * and following it can form a
491 					 * character range expression
492 					 * like "[a-z]"
493 					 */
494 					char_size = get_wchar(
495 					    &current_char, regexp);
496 					if (char_size <= 0) {
497 						ERROR_EXIT(&regcmp_lock,
498 						    arg_listp, compile_startp);
499 
500 					} else if (current_char ==
501 					    RIGHT_SQUARE_BRACKET) {
502 						/*
503 						 * the preceding DASH is
504 						 * the last character in the
505 						 * class and represents the
506 						 * DASH character itself
507 						 */
508 						*compilep = DASH;
509 						compilep++;
510 
511 					} else if (valid_range(
512 					    first_char_in_range,
513 					    current_char) == B_FALSE) {
514 						ERROR_EXIT(&regcmp_lock,
515 						    arg_listp, compile_startp);
516 					} else {
517 						/*
518 						 * the DASH is part of a
519 						 * character range
520 						 * expression; encode the
521 						 * rest of the expression
522 						 */
523 						regexp += char_size;
524 						*compilep = (unsigned char)
525 						    THRU;
526 						compilep++;
527 						expr_length = add_char(
528 						    compilep, current_char);
529 						compilep += expr_length;
530 						/*
531 						 * if a DASH follows this
532 						 * character range
533 						 * expression,
534 						 * it represents the DASH
535 						 * character itself
536 						 */
537 						dash_indicates_range =
538 						    B_FALSE;
539 					}
540 				}
541 
542 				/* GET THE NEXT CHARACTER */
543 
544 				char_size = get_wchar(&current_char, regexp);
545 				if (char_size <= 0) {
546 					ERROR_EXIT(&regcmp_lock,
547 					    arg_listp, compile_startp);
548 				} else {
549 					regexp += char_size;
550 				}
551 
552 			}
553 			/* end while (current_char != RIGHT_SQUARE_BRACKET) */
554 
555 			/* INSERT THE LENGTH OF THE CLASS INTO THE */
556 			/* COMPILED EXPRESSION */
557 
558 			class_length = (unsigned int)
559 			    (compilep - regex_typep - 1);
560 			if ((class_length < 2) ||
561 			    (class_length > MAX_SINGLE_BYTE_INT)) {
562 				ERROR_EXIT(&regcmp_lock, arg_listp,
563 				    compile_startp);
564 			} else {
565 				*(regex_typep + 1) = (unsigned char)
566 				    class_length;
567 			}
568 			break; /* end case LEFT_SQUARE_BRACKET */
569 
570 		case LEFT_PAREN:
571 
572 			/*
573 			 * start of a parenthesized group of regular
574 			 * expressions compiles to <'\0'><'\0'>, leaving
575 			 * space in the compiled regular expression for
576 			 * <group_type|ADDED_LENGTH_BITS><group_length>
577 			 */
578 
579 			if (push_compilep(compilep) == (char *)0) {
580 				/*
581 				 * groups can contain groups, so group
582 				 * start pointers
583 				 * must be saved and restored in sequence
584 				 */
585 				ERROR_EXIT(&regcmp_lock, arg_listp,
586 				    compile_startp);
587 			} else {
588 				can_repeat = B_FALSE;
589 				*compilep = '\0';	/* for debugging */
590 				compilep++;
591 				*compilep = '\0';	/* for debugging */
592 				compilep++;
593 			}
594 			break; /* end case LEFT_PAREN */
595 
596 		case RIGHT_PAREN:
597 			/* end of a marked group of regular expressions */
598 
599 			/*
600 			 * (<regex>)$0-9 compiles to
601 			 * <SAVED_GROUP><substringn><compiled_regex...>\
602 			 * <END_SAVED_GROUP><substringn><return_arg_number>
603 			 * (<regex>)* compiles to
604 			 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>
605 			 * <group_length> <compiled_regex...>
606 			 * <END_GROUP|ZERO_OR_MORE><groupn>
607 			 * (<regex>)+ compiles to
608 			 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>
609 			 * <group_length>\
610 			 * <compiled_regex...><END_GROUP|ONE_OR_MORE>
611 			 * <groupn>
612 			 * (<regex>){...} compiles to
613 			 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
614 			 * <compiled_regex...><END_GROUP|COUNT><groupn>\
615 			 * <minimum_repeat_count><maximum_repeat_count>
616 			 * otherwise (<regex>) compiles to
617 			 * <SIMPLE_GROUP><blank><compiled_regex...>
618 			 * <END_GROUP><groupn>
619 			 *
620 			 * NOTE:
621 			 *
622 			 * group_length + (256 * ADDED_LENGTH_BITS) ==
623 			 * length_of(<compiled_regex...><END_GROUP|...>
624 			 * <groupn>)
625 			 * which also ==
626 			 * length_of(<group_type|ADDED_LENGTH_BITS>
627 			 * <group_length>\ <compiled_regex...>)
628 			 * groupn no longer seems to be used, but the code
629 			 * still computes it to preserve backward
630 			 * compatibility
631 			 * with earlier versions of regex().
632 			 */
633 
634 			/* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */
635 
636 			regex_typep = pop_compilep();
637 			if (regex_typep == (char *)0) {
638 				ERROR_EXIT(&regcmp_lock, arg_listp,
639 				    compile_startp);
640 			}
641 			char_size = get_wchar(&current_char, regexp);
642 			if (char_size < 0) {
643 				ERROR_EXIT(&regcmp_lock, arg_listp,
644 				    compile_startp);
645 			} else if (char_size == 0) {
646 				*regex_typep = SIMPLE_GROUP;
647 				can_repeat = B_TRUE;
648 				*compilep = (unsigned char)END_GROUP;
649 				regex_typep = compilep;
650 				compilep++;
651 				*compilep = (unsigned char)groupn;
652 				groupn++;
653 				compilep++;
654 			} else if (current_char == DOLLAR_SIGN) {
655 				*regex_typep = SAVED_GROUP;
656 				regex_typep++;
657 				*regex_typep = (char)substringn;
658 				can_repeat = B_FALSE;
659 				regexp ++;
660 				return_arg_number = get_digit(regexp);
661 				if ((return_arg_number < 0) ||
662 				    (substringn >= NSUBSTRINGS)) {
663 					ERROR_EXIT(&regcmp_lock, arg_listp,
664 					    compile_startp);
665 				}
666 				regexp++;
667 				*compilep = (unsigned char)END_SAVED_GROUP;
668 				compilep++;
669 				*compilep = (unsigned char)substringn;
670 				substringn++;
671 				compilep++;
672 				*compilep = (unsigned char)return_arg_number;
673 				compilep++;
674 			} else {
675 				switch (current_char) {
676 				case STAR:
677 					*regex_typep = ZERO_OR_MORE_GROUP;
678 					break;
679 				case PLUS:
680 					*regex_typep = ONE_OR_MORE_GROUP;
681 					break;
682 				case LEFT_CURLY_BRACE:
683 					*regex_typep = COUNTED_GROUP;
684 					break;
685 				default:
686 					*regex_typep = SIMPLE_GROUP;
687 				}
688 				if (*regex_typep != SIMPLE_GROUP) {
689 					group_length = (unsigned int)
690 					    (compilep - regex_typep);
691 					if (group_length >= 1024) {
692 						ERROR_EXIT(&regcmp_lock,
693 						    arg_listp, compile_startp);
694 					}
695 					high_bits = group_length >>
696 					    TIMES_256_SHIFT;
697 					low_bits = group_length &
698 					    SINGLE_BYTE_MASK;
699 					*regex_typep =
700 					    (unsigned char)
701 					    ((unsigned int)
702 					    *regex_typep | high_bits);
703 					regex_typep++;
704 					*regex_typep =
705 					    (unsigned char)low_bits;
706 				}
707 				can_repeat = B_TRUE;
708 				*compilep = (unsigned char)END_GROUP;
709 				regex_typep = compilep;
710 				compilep++;
711 				*compilep = (unsigned char)groupn;
712 				groupn++;
713 				compilep++;
714 			}
715 
716 			break; /* end case RIGHT_PAREN */
717 
718 		case STAR: /* zero or more repetitions of the */
719 				/* preceding expression */
720 
721 			/*
722 			 * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\
723 			 * <compiled_regex...>
724 			 * (<regex...>)* compiles to
725 			 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
726 			 * <group_length><compiled_regex...>\
727 			 * <END_GROUP|ZERO_OR_MORE><groupn>
728 			 */
729 
730 			if (can_repeat == B_FALSE) {
731 				ERROR_EXIT(&regcmp_lock, arg_listp,
732 				    compile_startp);
733 			} else {
734 				can_repeat = B_FALSE;
735 				*regex_typep = (unsigned char)
736 				    ((unsigned int)*regex_typep | ZERO_OR_MORE);
737 			}
738 			break; /* end case '*' */
739 
740 		case PLUS:
741 			/* one or more repetitions of the preceding */
742 				/* expression */
743 
744 			/*
745 			 * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\
746 			 * <compiled_regex...> (<regex...>)+ compiles to
747 			 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
748 			 * <group_length><compiled_regex...>\
749 			 * <END_GROUP|ONE_OR_MORE><groupn>
750 			 */
751 
752 			if (can_repeat == B_FALSE) {
753 				ERROR_EXIT(&regcmp_lock, arg_listp,
754 				    compile_startp);
755 			} else {
756 				can_repeat = B_FALSE;
757 				*regex_typep =
758 				    (unsigned char)((unsigned int)*
759 				    regex_typep | ONE_OR_MORE);
760 			}
761 			break; /* end case '+' */
762 
763 		case LEFT_CURLY_BRACE:
764 
765 			/*
766 			 * repeat the preceding regular expression
767 			 * at least min_count times
768 			 * and at most max_count times
769 			 *
770 			 * <regex...>{min_count} compiles to
771 			 * <regex type|COUNT><compiled_regex...>
772 			 * <min_count><min_count>
773 			 *
774 			 * <regex...>{min_count,} compiles to
775 			 * <regex type|COUNT><compiled_regex...>
776 			 * <min_count><UNLIMITED>
777 			 *
778 			 * <regex...>{min_count,max_count} compiles to
779 			 * <regex type>|COUNT><compiled_regex...>
780 			 * <min_count><max_count>
781 			 *
782 			 * (<regex...>){min_count,max_count} compiles to
783 			 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
784 			 * <compiled_regex...><END_GROUP|COUNT><groupn>\
785 			 * <minimum_match_count><maximum_match_count>
786 			 */
787 
788 			if (can_repeat == B_FALSE) {
789 				ERROR_EXIT(&regcmp_lock, arg_listp,
790 				    compile_startp);
791 			}
792 			can_repeat = B_FALSE;
793 			*regex_typep = (unsigned char)((unsigned int)*
794 			    regex_typep | COUNT);
795 			count_length = get_count(&min_count, regexp);
796 			if (count_length <= 0) {
797 				ERROR_EXIT(&regcmp_lock, arg_listp,
798 				    compile_startp);
799 			}
800 			regexp += count_length;
801 
802 			if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */
803 				regexp++;
804 				max_count = min_count;
805 			} else if (*regexp == COMMA) { /* {min_count,..} */
806 				regexp++;
807 				/* {min_count,}   */
808 				if (*regexp == RIGHT_CURLY_BRACE) {
809 					regexp++;
810 					max_count = UNLIMITED;
811 				} else { /* {min_count,max_count} */
812 					count_length = get_count(
813 					    &max_count, regexp);
814 					if (count_length <= 0) {
815 						ERROR_EXIT(&regcmp_lock,
816 						    arg_listp, compile_startp);
817 					}
818 					regexp += count_length;
819 					if (*regexp != RIGHT_CURLY_BRACE) {
820 						ERROR_EXIT(&regcmp_lock,
821 						    arg_listp, compile_startp);
822 					}
823 					regexp++;
824 				}
825 			} else { /* invalid expression */
826 				ERROR_EXIT(&regcmp_lock, arg_listp,
827 				    compile_startp);
828 			}
829 
830 			if ((min_count > MAX_SINGLE_BYTE_INT) ||
831 			    ((max_count != UNLIMITED) &&
832 			    (min_count > max_count))) {
833 				ERROR_EXIT(&regcmp_lock, arg_listp,
834 				    compile_startp);
835 			} else {
836 				*compilep = (unsigned char)min_count;
837 				compilep++;
838 				*compilep = (unsigned char)max_count;
839 				compilep++;
840 			}
841 			break; /* end case LEFT_CURLY_BRACE */
842 
843 		default: /* a single non-special character */
844 
845 			/*
846 			 * compiles to <ASCII_CHAR><ascii_char> or
847 			 * <MULTIBYTE_CHAR><multibyte_char>
848 			 */
849 
850 			can_repeat = B_TRUE;
851 			regex_typep = compilep;
852 			expr_length = add_single_char_expr(compilep,
853 			    current_char);
854 			compilep += expr_length;
855 
856 		} /* end switch (current_char) */
857 
858 		/* GET THE NEXT CHARACTER FOR THE WHILE LOOP */
859 
860 		char_size = get_wchar(&current_char, regexp);
861 		if (char_size < 0) {
862 			ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
863 		} else if (char_size > 0) {
864 			regexp += char_size;
865 		} else if /* (char_size == 0) && */ (next_argp != (char *)0) {
866 			regexp = next_argp;
867 			next_argp = va_arg(arg_listp, /* const */ char *);
868 			char_size = get_wchar(&current_char, regexp);
869 			if (char_size <= 0) {
870 				ERROR_EXIT(&regcmp_lock, arg_listp,
871 				    compile_startp);
872 			} else {
873 				regexp += char_size;
874 			}
875 		} else /* ((char_size == 0) && (next_argp == (char *)0)) */ {
876 			if (pop_compilep() != (char *)0) {
877 				/* unmatched parentheses */
878 				ERROR_EXIT(&regcmp_lock, arg_listp,
879 				    compile_startp);
880 			}
881 			*compilep = (unsigned char)END_REGEX;
882 			compilep++;
883 			*compilep = '\0';
884 			compilep++;
885 			__i_size = (int)(compilep - compile_startp);
886 			va_end(arg_listp);
887 			lmutex_unlock(&regcmp_lock);
888 			return (compile_startp);
889 		}
890 	} /* end for (;;) */
891 
892 } /* regcmp() */
893 
894 
895 /* DEFINITIONS OF PRIVATE FUNCTIONS */
896 
/*
 * add_char() stores one character of a character class at compilep.
 * A value of 0x7f or below is stored as a single byte; anything else
 * is converted with wctomb().  The return value is the number of
 * bytes stored, or whatever wctomb() returned.
 */
static int
add_char(char *compilep, wchar_t wchar)
{
	if ((unsigned int)wchar > (unsigned int)0x7f)
		return (wctomb(compilep, wchar));

	*compilep = (unsigned char)wchar;
	return (1);
}
910 
911 static int
912 add_single_char_expr(char *compilep, wchar_t wchar)
913 {
914 	int expr_length = 0;
915 
916 	if ((unsigned int)wchar <= (unsigned int)0x7f) {
917 		*compilep = (unsigned char)ASCII_CHAR;
918 		compilep++;
919 		*compilep = (unsigned char)wchar;
920 		expr_length += 2;
921 	} else {
922 		*compilep = (unsigned char)MULTIBYTE_CHAR;
923 		compilep++;
924 		expr_length++;
925 		expr_length += wctomb(compilep, wchar);
926 	}
927 	return (expr_length);
928 }
929 
/*
 * get_count() parses the run of decimal digits at the start of regexp.
 *
 * On return *countp holds the value of the digits and the function
 * result is the number of digit characters consumed; a result of 0
 * means regexp did not start with a digit (*countp is then 0).  When
 * regexp is a null pointer the result is 0 and *countp is left
 * untouched.
 *
 * The digit run is not bounded, so the accumulated value saturates at
 * INT_MAX instead of overflowing a signed int, which would be
 * undefined behavior.  The digits are still consumed and counted, so
 * the caller stays in step with the input.
 */
static int
get_count(int *countp, const char *regexp)
{
	int count = 0;
	int count_length = 0;

	if (regexp == (char *)0)
		return (0);

	while (('0' <= *regexp) && (*regexp <= '9')) {
		int digit = (int)(*regexp - '0');

		/* would (10 * count) + digit exceed INT_MAX? */
		if (count > (INT_MAX - digit) / 10)
			count = INT_MAX;
		else
			count = (10 * count) + digit;
		count_length++;
		regexp++;
	}
	*countp = count;
	return (count_length);
}
951 
952 static int
953 get_digit(const char *regexp)
954 {
955 	char digit;
956 
957 	if (regexp == (char *)0) {
958 		return ((int)-1);
959 	} else {
960 		digit = *regexp;
961 		if (('0' <= digit) && (digit <= '9')) {
962 			return ((int)(digit - '0'));
963 		} else {
964 			return ((int)-1);
965 		}
966 	}
967 }
968 
969 static int
970 get_wchar(wchar_t *wcharp, const char *regexp)
971 {
972 	int char_size;
973 
974 	if (regexp == (char *)0) {
975 		char_size = 0;
976 		*wcharp = (wchar_t)((unsigned int)'\0');
977 	} else if (*regexp == '\0') {
978 		char_size = 0;
979 		*wcharp = (wchar_t)((unsigned int)*regexp);
980 	} else if ((unsigned char)*regexp <= (unsigned char)0x7f) {
981 		char_size = 1;
982 		*wcharp = (wchar_t)((unsigned int)*regexp);
983 	} else {
984 		char_size = mbtowc(wcharp, regexp, MB_LEN_MAX);
985 	}
986 	return (char_size);
987 }
988 
989 static char *
990 pop_compilep(void)
991 {
992 	char *compilep;
993 
994 	if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) {
995 		return ((char *)0);
996 	} else {
997 		compilep = *compilep_stackp;
998 		compilep_stackp++;
999 		return (compilep);
1000 	}
1001 }
1002 
1003 static char *
1004 push_compilep(char *compilep)
1005 {
1006 	if (compilep_stackp <= &compilep_stack[0]) {
1007 		return ((char *)0);
1008 	} else {
1009 		compilep_stackp--;
1010 		*compilep_stackp = compilep;
1011 		return (compilep);
1012 	}
1013 }
1014 
1015 static boolean_t
1016 valid_range(wchar_t lower_char, wchar_t upper_char)
1017 {
1018 	return (((lower_char <= 0x7f) && (upper_char <= 0x7f) &&
1019 	    !iswcntrl(lower_char) && !iswcntrl(upper_char) &&
1020 	    (lower_char < upper_char)) ||
1021 	    (((lower_char & WCHAR_CSMASK) ==
1022 	    (upper_char & WCHAR_CSMASK)) &&
1023 	    (lower_char < upper_char)));
1024 }
1025