1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * IMPORTANT NOTE:
32 *
33 * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS.
34 * IT IS **NOT** CHARACTER SET INDEPENDENT.
35 *
36 */
37
38 #pragma weak _regcmp = regcmp
39
40 #include "lint.h"
41 #include "mtlib.h"
42 #include <limits.h>
43 #include <stdarg.h>
44 #include <stdlib.h>
45 #include <thread.h>
46 #include <wctype.h>
47 #include <widec.h>
48 #include <string.h>
49 #include "tsd.h"
50
51
52 /* CONSTANTS SHARED WITH regex() */
53
54 #include "regex.h"
55
56 /* PRIVATE CONSTANTS */
57
58 #define BACKSLASH '\\'
59 #define CIRCUMFLEX '^'
60 #define COMMA ','
61 #define DASH '-'
62 #define DOLLAR_SIGN '$'
63 #define DOT '.'
64 #define LEFT_CURLY_BRACE '{'
65 #define LEFT_PAREN '('
66 #define LEFT_SQUARE_BRACKET '['
67 #define PLUS '+'
68 #define RIGHT_CURLY_BRACE '}'
69 #define RIGHT_PAREN ')'
70 #define RIGHT_SQUARE_BRACKET ']'
71 #define SINGLE_BYTE_MASK 0xff
72 #define STRINGP_STACK_SIZE 50
73 #define STAR '*'
74
75 /* PRIVATE GLOBAL VARIABLES */
76
77 static char *compilep_stack[STRINGP_STACK_SIZE];
78 static char **compilep_stackp;
79 static mutex_t regcmp_lock = DEFAULTMUTEX;
80
81 /* DECLARATIONS OF PRIVATE FUNCTIONS */
82
83 static int add_char(char *compilep, wchar_t wchar);
84 static int add_single_char_expr(char *compilep, wchar_t wchar);
85
/*
 * ERROR_EXIT() is the common failure path of regcmp(): it ends the
 * variable-argument traversal, releases the given lock, frees the partially
 * built compiled expression and makes the enclosing function return a null
 * pointer.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct even in an unbraced if/else.  The
 * compile_startp argument is parenthesised and evaluated once; free()
 * accepts a null pointer, so no separate null test is needed.
 */
#define	ERROR_EXIT(mutex_lockp, arg_listp, compile_startp)	\
	do {							\
		va_end(arg_listp);				\
		lmutex_unlock(mutex_lockp);			\
		free((void *)(compile_startp));			\
		return ((char *)0);				\
	} while (0)
93
94 static int get_count(int *countp, const char *regexp);
95 static int get_digit(const char *regexp);
96 static int get_wchar(wchar_t *wchar, const char *regexp);
97 static char *pop_compilep(void);
98 static char *push_compilep(char *compilep);
99 static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char);
100
101
102 /* DEFINITIONS OF PUBLIC VARIABLES */
103
104 int __i_size;
105
106 /*
107 * define thread-specific storage for __i_size
108 *
109 */
110 int *
___i_size(void)111 ___i_size(void)
112 {
113 if (thr_main())
114 return (&__i_size);
115 return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL));
116 }
117
118 #define __i_size (*(___i_size()))
119
120 /* DEFINITION OF regcmp() */
121
122 extern char *
regcmp(const char * regexp,...)123 regcmp(const char *regexp, ...)
124 {
125 va_list arg_listp;
126 size_t arg_strlen;
127 boolean_t can_repeat;
128 int char_size;
129 unsigned int class_length;
130 char *compilep;
131 char *compile_startp = (char *)0;
132 int count_length;
133 wchar_t current_char;
134 int expr_length;
135 int groupn;
136 unsigned int group_length;
137 unsigned int high_bits;
138 boolean_t dash_indicates_range;
139 unsigned int low_bits;
140 int max_count;
141 int min_count;
142 const char *next_argp;
143 wchar_t first_char_in_range;
144 char *regex_typep;
145 int return_arg_number;
146 int substringn;
147
148 if (___i_size() == (int *)0)
149 return ((char *)0);
150
151 /*
152 * When compiling a regular expression, regcmp() generates at most
153 * two extra single-byte characters for each character in the
154 * expression, so allocating three times the number of bytes in all
155 * the strings that comprise the regular expression will ensure that
156 * regcmp() won't overwrite the end of the allocated block when
157 * compiling the expression.
158 */
159
160 va_start(arg_listp, regexp);
161 next_argp = regexp;
162 arg_strlen = 0;
163 while (next_argp != (char *)0) {
164 arg_strlen += strlen(next_argp);
165 next_argp = va_arg(arg_listp, /* const */ char *);
166 }
167 va_end(arg_listp);
168
169 if (arg_strlen == 0)
170 return ((char *)0);
171 compile_startp = (char *)malloc(3 * arg_strlen + 1);
172 if (compile_startp == (char *)0)
173 return ((char *)0);
174
175 lmutex_lock(®cmp_lock);
176 __i_size = 0;
177 compilep = compile_startp;
178 compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE];
179
180 /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */
181 va_start(arg_listp, regexp);
182 next_argp = va_arg(arg_listp, /* const */ char *);
183 char_size = get_wchar(¤t_char, regexp);
184 if (char_size < 0) {
185 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp);
186 } else if (char_size > 0) {
187 regexp += char_size;
188 } else /* (char_size == 0 ) */ {
189 regexp = next_argp;
190 next_argp = va_arg(arg_listp, /* const */ char *);
191 char_size = get_wchar(¤t_char, regexp);
192 if (char_size <= 0) {
193 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp);
194 } else {
195 regexp += char_size;
196 }
197 }
198
199 /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */
200
201 if (current_char == CIRCUMFLEX) {
202 char_size = get_wchar(¤t_char, regexp);
203 if (char_size < 0) {
204 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp);
205 } else if (char_size > 0) {
206 regexp += char_size;
207 *compilep = (unsigned char)START_OF_STRING_MARK;
208 compilep++;
209 } else if /* (char_size == 0) && */ (next_argp != (char *)0) {
210 regexp = next_argp;
211 next_argp = va_arg(arg_listp, /* const */ char *);
212 char_size = get_wchar(¤t_char, regexp);
213 if (char_size <= 0) {
214 ERROR_EXIT(®cmp_lock, arg_listp,
215 compile_startp);
216 } else {
217 regexp += char_size;
218 }
219 *compilep = (unsigned char)START_OF_STRING_MARK;
220 compilep++;
221 } else {
222 /* ((char_size==0) && (next_argp==(char *)0)) */
223 /*
224 * the regular expression is "^"
225 */
226 *compilep = (unsigned char)START_OF_STRING_MARK;
227 compilep++;
228 *compilep = (unsigned char)END_REGEX;
229 compilep++;
230 *compilep = '\0';
231 compilep++;
232 __i_size = (int)(compilep - compile_startp);
233 va_end(arg_listp);
234 lmutex_unlock(®cmp_lock);
235 return (compile_startp);
236 }
237 }
238
239 /* COMPILE THE REGULAR EXPRESSION */
240
241 groupn = 0;
242 substringn = 0;
243 can_repeat = B_FALSE;
244 for (;;) {
245
246 /*
247 * At the end of each iteration get the next character
248 * from the regular expression and increment regexp to
249 * point to the following character. Exit when all
250 * the characters in all the strings in the argument
251 * list have been read.
252 */
253
254 switch (current_char) {
255
256 /*
257 * No fall-through. Each case ends with either
258 * a break or an error exit. Each case starts
259 * with compilep addressing the next location to
260 * be written in the compiled regular expression,
261 * and with regexp addressing the next character
262 * to be read from the regular expression being
263 * compiled. Each case that doesn't return
264 * increments regexp to address the next character
265 * to be read from the regular expression and
266 * increments compilep to address the next
267 * location to be written in the compiled
268 * regular expression.
269 *
270 * NOTE: The comments for each case give the meaning
271 * of the regular expression compiled by the case
272 * and the character string written to the compiled
273 * regular expression by the case. Each single
274 * character
275 * written to the compiled regular expression is
276 * shown enclosed in angle brackets (<>). Each
277 * compiled regular expression begins with a marker
278 * character which is shown as a named constant
279 * (e.g. <ASCII_CHAR>). Character constants are
280 * shown enclosed in single quotes (e.g. <'$'>).
281 * All other single characters written to the
282 * compiled regular expression are shown as lower
283 * case variable names (e.g. <ascii_char> or
284 * <multibyte_char>). Multicharacter
285 * strings written to the compiled regular expression
286 * are shown as variable names followed by elipses
287 * (e.g. <regex...>).
288 */
289
290 case DOLLAR_SIGN:
291 /* end of string marker or simple dollar sign */
292 /* compiles to <END_OF_STRING_MARK> or */
293 /* <ASCII_CHAR><'$'> */
294
295 char_size = get_wchar(¤t_char, regexp);
296 if ((char_size == 0) && (next_argp == (char *)0)) {
297 can_repeat = B_FALSE;
298 *compilep = (unsigned char)END_OF_STRING_MARK;
299 compilep++;
300 } else {
301 can_repeat = B_TRUE;
302 *compilep = (unsigned char)ASCII_CHAR;
303 regex_typep = compilep;
304 compilep++;
305 *compilep = DOLLAR_SIGN;
306 compilep++;
307 }
308 break; /* end case DOLLAR_SIGN */
309
310 case DOT: /* any character */
311
312 /* compiles to <ANY_CHAR> */
313
314 can_repeat = B_TRUE;
315 *compilep = (unsigned char)ANY_CHAR;
316 regex_typep = compilep;
317 compilep++;
318
319 break; /* end case DOT */
320
321 case BACKSLASH: /* escaped character */
322
323 /*
324 * compiles to <ASCII_CHAR><ascii_char> or
325 * <MULTIBYTE_CHAR><multibyte_char>
326 */
327
328 char_size = get_wchar(¤t_char, regexp);
329 if (char_size <= 0) {
330 ERROR_EXIT(®cmp_lock, arg_listp,
331 compile_startp);
332 } else {
333 regexp += char_size;
334 can_repeat = B_TRUE;
335 expr_length = add_single_char_expr(
336 compilep, current_char);
337 regex_typep = compilep;
338 compilep += expr_length;
339 }
340 break; /* end case '\\' */
341
342 case LEFT_SQUARE_BRACKET:
343 /* start of a character class expression */
344
345 /*
346 * [^...c...] compiles to
347 * <NOT_IN_CLASS><class_length><...c...>
348 * [^...a-z...] compiles to
349 * <NOT_IN_CLASS><class_length><...a<THRU>z...>
350 * [...c...] compiles to
351 * <IN_CLASS><class_length><...c...>
352 * [...a-z...] compiles to
353 * <IN_CLASS><class_length><...a<THRU>z...>
354 *
355 * NOTE: <class_length> includes the
356 * <class_length> byte
357 */
358
359 can_repeat = B_TRUE;
360 regex_typep = compilep;
361
362 /* DETERMINE THE CLASS TYPE */
363
364 /*
365 * NOTE: This algorithm checks the value of the
366 * "multibyte"
367 * macro in <euc.h> (included in <widec.h> )
368 * to find out if regcmp()
369 * is compiling the regular expression in a
370 * multibyte locale.
371 */
372 char_size = get_wchar(¤t_char, regexp);
373 if (char_size <= 0) {
374 ERROR_EXIT(®cmp_lock, arg_listp,
375 compile_startp);
376 } else if (current_char == CIRCUMFLEX) {
377 regexp++;
378 char_size = get_wchar(¤t_char, regexp);
379 if (char_size <= 0) {
380 ERROR_EXIT(®cmp_lock,
381 arg_listp, compile_startp);
382 } else {
383 regexp += char_size;
384 if (!multibyte) {
385 *compilep = (unsigned char)
386 NOT_IN_ASCII_CHAR_CLASS;
387 } else {
388 *compilep = (unsigned char)
389 NOT_IN_MULTIBYTE_CHAR_CLASS;
390 }
391 /* leave space for <class_length> */
392 compilep += 2;
393 }
394 } else {
395 regexp += char_size;
396 if (!multibyte) {
397 *compilep = (unsigned char)
398 IN_ASCII_CHAR_CLASS;
399 } else {
400 *compilep = (unsigned char)
401 IN_MULTIBYTE_CHAR_CLASS;
402 }
403 /* leave space for <class_length> */
404 compilep += 2;
405 }
406
407 /* COMPILE THE CLASS */
408 /*
409 * check for a leading right square bracket,
410 * which is allowed
411 */
412
413 if (current_char == RIGHT_SQUARE_BRACKET) {
414 /*
415 * the leading RIGHT_SQUARE_BRACKET may
416 * be part of a character range
417 * expression like "[]-\]"
418 */
419 dash_indicates_range = B_TRUE;
420 first_char_in_range = current_char;
421 char_size = get_wchar(¤t_char, regexp);
422 if (char_size <= 0) {
423 ERROR_EXIT(®cmp_lock,
424 arg_listp, compile_startp);
425 } else {
426 regexp += char_size;
427 *compilep = RIGHT_SQUARE_BRACKET;
428 compilep++;
429 }
430 } else {
431 /*
432 * decode the character in the following
433 * while loop and decide then if it can
434 * be the first character
435 * in a character range expression
436 */
437 dash_indicates_range = B_FALSE;
438 }
439
440 while (current_char != RIGHT_SQUARE_BRACKET) {
441 if (current_char != DASH) {
442 /*
443 * if a DASH follows current_char,
444 * current_char, the DASH and the
445 * character that follows the DASH
446 * may form a character range
447 * expression
448 */
449 dash_indicates_range = B_TRUE;
450 first_char_in_range = current_char;
451 expr_length = add_char(
452 compilep, current_char);
453 compilep += expr_length;
454
455 } else if /* (current_char == DASH) && */
456 (dash_indicates_range == B_FALSE) {
457 /*
458 * current_char is a DASH, but
459 * either begins the entire
460 * character class or follows a
461 * character that's already
462 * part of a character range
463 * expression, so it simply
464 * represents the DASH character
465 * itself
466 */
467 *compilep = DASH;
468 compilep ++;
469 /*
470 * if another DASH follows this
471 * one, this DASH is part
472 * of a character range expression
473 * like "[--\]"
474 */
475 dash_indicates_range = B_TRUE;
476 first_char_in_range = current_char;
477
478 } else {
479 /*
480 * ((current_char == DASH &&/
481 * (dash_indicates_range == B_TRUE))
482 */
483
484 /*
485 * the DASH appears after a single
486 * character that isn't
487 * already part of a character
488 * range expression, so it
489 * and the characters preceding
490 * and following it can form a
491 * character range expression
492 * like "[a-z]"
493 */
494 char_size = get_wchar(
495 ¤t_char, regexp);
496 if (char_size <= 0) {
497 ERROR_EXIT(®cmp_lock,
498 arg_listp, compile_startp);
499
500 } else if (current_char ==
501 RIGHT_SQUARE_BRACKET) {
502 /*
503 * the preceding DASH is
504 * the last character in the
505 * class and represents the
506 * DASH character itself
507 */
508 *compilep = DASH;
509 compilep++;
510
511 } else if (valid_range(
512 first_char_in_range,
513 current_char) == B_FALSE) {
514 ERROR_EXIT(®cmp_lock,
515 arg_listp, compile_startp);
516 } else {
517 /*
518 * the DASH is part of a
519 * character range
520 * expression; encode the
521 * rest of the expression
522 */
523 regexp += char_size;
524 *compilep = (unsigned char)
525 THRU;
526 compilep++;
527 expr_length = add_char(
528 compilep, current_char);
529 compilep += expr_length;
530 /*
531 * if a DASH follows this
532 * character range
533 * expression,
534 * it represents the DASH
535 * character itself
536 */
537 dash_indicates_range =
538 B_FALSE;
539 }
540 }
541
542 /* GET THE NEXT CHARACTER */
543
544 char_size = get_wchar(¤t_char, regexp);
545 if (char_size <= 0) {
546 ERROR_EXIT(®cmp_lock,
547 arg_listp, compile_startp);
548 } else {
549 regexp += char_size;
550 }
551
552 }
553 /* end while (current_char != RIGHT_SQUARE_BRACKET) */
554
555 /* INSERT THE LENGTH OF THE CLASS INTO THE */
556 /* COMPILED EXPRESSION */
557
558 class_length = (unsigned int)
559 (compilep - regex_typep - 1);
560 if ((class_length < 2) ||
561 (class_length > MAX_SINGLE_BYTE_INT)) {
562 ERROR_EXIT(®cmp_lock, arg_listp,
563 compile_startp);
564 } else {
565 *(regex_typep + 1) = (unsigned char)
566 class_length;
567 }
568 break; /* end case LEFT_SQUARE_BRACKET */
569
570 case LEFT_PAREN:
571
572 /*
573 * start of a parenthesized group of regular
574 * expressions compiles to <'\0'><'\0'>, leaving
575 * space in the compiled regular expression for
576 * <group_type|ADDED_LENGTH_BITS><group_length>
577 */
578
579 if (push_compilep(compilep) == (char *)0) {
580 /*
581 * groups can contain groups, so group
582 * start pointers
583 * must be saved and restored in sequence
584 */
585 ERROR_EXIT(®cmp_lock, arg_listp,
586 compile_startp);
587 } else {
588 can_repeat = B_FALSE;
589 *compilep = '\0'; /* for debugging */
590 compilep++;
591 *compilep = '\0'; /* for debugging */
592 compilep++;
593 }
594 break; /* end case LEFT_PAREN */
595
596 case RIGHT_PAREN:
597 /* end of a marked group of regular expressions */
598
599 /*
600 * (<regex>)$0-9 compiles to
601 * <SAVED_GROUP><substringn><compiled_regex...>\
602 * <END_SAVED_GROUP><substringn><return_arg_number>
603 * (<regex>)* compiles to
604 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>
605 * <group_length> <compiled_regex...>
606 * <END_GROUP|ZERO_OR_MORE><groupn>
607 * (<regex>)+ compiles to
608 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>
609 * <group_length>\
610 * <compiled_regex...><END_GROUP|ONE_OR_MORE>
611 * <groupn>
612 * (<regex>){...} compiles to
613 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
614 * <compiled_regex...><END_GROUP|COUNT><groupn>\
615 * <minimum_repeat_count><maximum_repeat_count>
616 * otherwise (<regex>) compiles to
617 * <SIMPLE_GROUP><blank><compiled_regex...>
618 * <END_GROUP><groupn>
619 *
620 * NOTE:
621 *
622 * group_length + (256 * ADDED_LENGTH_BITS) ==
623 * length_of(<compiled_regex...><END_GROUP|...>
624 * <groupn>)
625 * which also ==
626 * length_of(<group_type|ADDED_LENGTH_BITS>
627 * <group_length>\ <compiled_regex...>)
628 * groupn no longer seems to be used, but the code
629 * still computes it to preserve backward
630 * compatibility
631 * with earlier versions of regex().
632 */
633
634 /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */
635
636 regex_typep = pop_compilep();
637 if (regex_typep == (char *)0) {
638 ERROR_EXIT(®cmp_lock, arg_listp,
639 compile_startp);
640 }
641 char_size = get_wchar(¤t_char, regexp);
642 if (char_size < 0) {
643 ERROR_EXIT(®cmp_lock, arg_listp,
644 compile_startp);
645 } else if (char_size == 0) {
646 *regex_typep = SIMPLE_GROUP;
647 can_repeat = B_TRUE;
648 *compilep = (unsigned char)END_GROUP;
649 regex_typep = compilep;
650 compilep++;
651 *compilep = (unsigned char)groupn;
652 groupn++;
653 compilep++;
654 } else if (current_char == DOLLAR_SIGN) {
655 *regex_typep = SAVED_GROUP;
656 regex_typep++;
657 *regex_typep = (char)substringn;
658 can_repeat = B_FALSE;
659 regexp ++;
660 return_arg_number = get_digit(regexp);
661 if ((return_arg_number < 0) ||
662 (substringn >= NSUBSTRINGS)) {
663 ERROR_EXIT(®cmp_lock, arg_listp,
664 compile_startp);
665 }
666 regexp++;
667 *compilep = (unsigned char)END_SAVED_GROUP;
668 compilep++;
669 *compilep = (unsigned char)substringn;
670 substringn++;
671 compilep++;
672 *compilep = (unsigned char)return_arg_number;
673 compilep++;
674 } else {
675 switch (current_char) {
676 case STAR:
677 *regex_typep = ZERO_OR_MORE_GROUP;
678 break;
679 case PLUS:
680 *regex_typep = ONE_OR_MORE_GROUP;
681 break;
682 case LEFT_CURLY_BRACE:
683 *regex_typep = COUNTED_GROUP;
684 break;
685 default:
686 *regex_typep = SIMPLE_GROUP;
687 }
688 if (*regex_typep != SIMPLE_GROUP) {
689 group_length = (unsigned int)
690 (compilep - regex_typep);
691 if (group_length >= 1024) {
692 ERROR_EXIT(®cmp_lock,
693 arg_listp, compile_startp);
694 }
695 high_bits = group_length >>
696 TIMES_256_SHIFT;
697 low_bits = group_length &
698 SINGLE_BYTE_MASK;
699 *regex_typep =
700 (unsigned char)
701 ((unsigned int)
702 *regex_typep | high_bits);
703 regex_typep++;
704 *regex_typep =
705 (unsigned char)low_bits;
706 }
707 can_repeat = B_TRUE;
708 *compilep = (unsigned char)END_GROUP;
709 regex_typep = compilep;
710 compilep++;
711 *compilep = (unsigned char)groupn;
712 groupn++;
713 compilep++;
714 }
715
716 break; /* end case RIGHT_PAREN */
717
718 case STAR: /* zero or more repetitions of the */
719 /* preceding expression */
720
721 /*
722 * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\
723 * <compiled_regex...>
724 * (<regex...>)* compiles to
725 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
726 * <group_length><compiled_regex...>\
727 * <END_GROUP|ZERO_OR_MORE><groupn>
728 */
729
730 if (can_repeat == B_FALSE) {
731 ERROR_EXIT(®cmp_lock, arg_listp,
732 compile_startp);
733 } else {
734 can_repeat = B_FALSE;
735 *regex_typep = (unsigned char)
736 ((unsigned int)*regex_typep | ZERO_OR_MORE);
737 }
738 break; /* end case '*' */
739
740 case PLUS:
741 /* one or more repetitions of the preceding */
742 /* expression */
743
744 /*
745 * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\
746 * <compiled_regex...> (<regex...>)+ compiles to
747 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
748 * <group_length><compiled_regex...>\
749 * <END_GROUP|ONE_OR_MORE><groupn>
750 */
751
752 if (can_repeat == B_FALSE) {
753 ERROR_EXIT(®cmp_lock, arg_listp,
754 compile_startp);
755 } else {
756 can_repeat = B_FALSE;
757 *regex_typep =
758 (unsigned char)((unsigned int)*
759 regex_typep | ONE_OR_MORE);
760 }
761 break; /* end case '+' */
762
763 case LEFT_CURLY_BRACE:
764
765 /*
766 * repeat the preceding regular expression
767 * at least min_count times
768 * and at most max_count times
769 *
770 * <regex...>{min_count} compiles to
771 * <regex type|COUNT><compiled_regex...>
772 * <min_count><min_count>
773 *
774 * <regex...>{min_count,} compiles to
775 * <regex type|COUNT><compiled_regex...>
776 * <min_count><UNLIMITED>
777 *
778 * <regex...>{min_count,max_count} compiles to
779 * <regex type>|COUNT><compiled_regex...>
780 * <min_count><max_count>
781 *
782 * (<regex...>){min_count,max_count} compiles to
783 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
784 * <compiled_regex...><END_GROUP|COUNT><groupn>\
785 * <minimum_match_count><maximum_match_count>
786 */
787
788 if (can_repeat == B_FALSE) {
789 ERROR_EXIT(®cmp_lock, arg_listp,
790 compile_startp);
791 }
792 can_repeat = B_FALSE;
793 *regex_typep = (unsigned char)((unsigned int)*
794 regex_typep | COUNT);
795 count_length = get_count(&min_count, regexp);
796 if (count_length <= 0) {
797 ERROR_EXIT(®cmp_lock, arg_listp,
798 compile_startp);
799 }
800 regexp += count_length;
801
802 if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */
803 regexp++;
804 max_count = min_count;
805 } else if (*regexp == COMMA) { /* {min_count,..} */
806 regexp++;
807 /* {min_count,} */
808 if (*regexp == RIGHT_CURLY_BRACE) {
809 regexp++;
810 max_count = UNLIMITED;
811 } else { /* {min_count,max_count} */
812 count_length = get_count(
813 &max_count, regexp);
814 if (count_length <= 0) {
815 ERROR_EXIT(®cmp_lock,
816 arg_listp, compile_startp);
817 }
818 regexp += count_length;
819 if (*regexp != RIGHT_CURLY_BRACE) {
820 ERROR_EXIT(®cmp_lock,
821 arg_listp, compile_startp);
822 }
823 regexp++;
824 }
825 } else { /* invalid expression */
826 ERROR_EXIT(®cmp_lock, arg_listp,
827 compile_startp);
828 }
829
830 if ((min_count > MAX_SINGLE_BYTE_INT) ||
831 ((max_count != UNLIMITED) &&
832 (min_count > max_count))) {
833 ERROR_EXIT(®cmp_lock, arg_listp,
834 compile_startp);
835 } else {
836 *compilep = (unsigned char)min_count;
837 compilep++;
838 *compilep = (unsigned char)max_count;
839 compilep++;
840 }
841 break; /* end case LEFT_CURLY_BRACE */
842
843 default: /* a single non-special character */
844
845 /*
846 * compiles to <ASCII_CHAR><ascii_char> or
847 * <MULTIBYTE_CHAR><multibyte_char>
848 */
849
850 can_repeat = B_TRUE;
851 regex_typep = compilep;
852 expr_length = add_single_char_expr(compilep,
853 current_char);
854 compilep += expr_length;
855
856 } /* end switch (current_char) */
857
858 /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */
859
860 char_size = get_wchar(¤t_char, regexp);
861 if (char_size < 0) {
862 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp);
863 } else if (char_size > 0) {
864 regexp += char_size;
865 } else if /* (char_size == 0) && */ (next_argp != (char *)0) {
866 regexp = next_argp;
867 next_argp = va_arg(arg_listp, /* const */ char *);
868 char_size = get_wchar(¤t_char, regexp);
869 if (char_size <= 0) {
870 ERROR_EXIT(®cmp_lock, arg_listp,
871 compile_startp);
872 } else {
873 regexp += char_size;
874 }
875 } else /* ((char_size == 0) && (next_argp == (char *)0)) */ {
876 if (pop_compilep() != (char *)0) {
877 /* unmatched parentheses */
878 ERROR_EXIT(®cmp_lock, arg_listp,
879 compile_startp);
880 }
881 *compilep = (unsigned char)END_REGEX;
882 compilep++;
883 *compilep = '\0';
884 compilep++;
885 __i_size = (int)(compilep - compile_startp);
886 va_end(arg_listp);
887 lmutex_unlock(®cmp_lock);
888 return (compile_startp);
889 }
890 } /* end for (;;) */
891
892 } /* regcmp() */
893
894
895 /* DEFINITIONS OF PRIVATE FUNCTIONS */
896
/*
 * Store wchar at compilep in its external form and return the number of
 * bytes written: a 7-bit character is stored directly as one byte, any
 * other character is converted with wctomb(), whose return value is passed
 * back unchanged.
 */
static int
add_char(char *compilep, wchar_t wchar)
{
	if ((unsigned int)wchar > 0x7fU)
		return (wctomb(compilep, wchar));

	*compilep = (unsigned char)wchar;
	return (1);
}
910
911 static int
add_single_char_expr(char * compilep,wchar_t wchar)912 add_single_char_expr(char *compilep, wchar_t wchar)
913 {
914 int expr_length = 0;
915
916 if ((unsigned int)wchar <= (unsigned int)0x7f) {
917 *compilep = (unsigned char)ASCII_CHAR;
918 compilep++;
919 *compilep = (unsigned char)wchar;
920 expr_length += 2;
921 } else {
922 *compilep = (unsigned char)MULTIBYTE_CHAR;
923 compilep++;
924 expr_length++;
925 expr_length += wctomb(compilep, wchar);
926 }
927 return (expr_length);
928 }
929
/*
 * Parse the run of decimal digits at the start of regexp.
 *
 * On return *countp holds the value of the digits (0 when there are none)
 * and the function result is the number of digit characters consumed.
 * If regexp is a null pointer, 0 is returned and *countp is left untouched.
 *
 * The value saturates at INT_MAX instead of overflowing: signed overflow is
 * undefined behavior, and every digit is still counted so the caller's
 * position in the expression stays correct.  Callers reject out-of-range
 * counts themselves.
 */
static int
get_count(int *countp, const char *regexp)
{
	int count = 0;
	int count_length = 0;

	if (regexp == NULL)
		return (0);

	for (; ('0' <= *regexp) && (*regexp <= '9'); regexp++) {
		int digit = *regexp - '0';

		/* 10 * count + digit <= INT_MAX, tested without overflow */
		if (count > (INT_MAX - digit) / 10)
			count = INT_MAX;
		else
			count = (10 * count) + digit;
		count_length++;
	}

	*countp = count;
	return (count_length);
}
951
/*
 * Return the numeric value (0-9) of the decimal digit that regexp points
 * at, or -1 when regexp is a null pointer or does not start with a digit.
 */
static int
get_digit(const char *regexp)
{
	if ((regexp != (char *)0) && ('0' <= *regexp) && (*regexp <= '9'))
		return ((int)(*regexp - '0'));

	return (-1);
}
968
/*
 * Decode the character at the start of regexp into *wcharp and return the
 * number of bytes it occupies.
 *
 * A null pointer or an empty string yields 0 and stores a null wide
 * character.  A byte in the range 0x01-0x7f is taken as is (size 1).
 * Anything else is handed to mbtowc(), whose return value - including its
 * error indication - is passed back unchanged.
 */
static int
get_wchar(wchar_t *wcharp, const char *regexp)
{
	unsigned char lead;

	if ((regexp == (char *)0) || (*regexp == '\0')) {
		*wcharp = (wchar_t)0;
		return (0);
	}

	lead = (unsigned char)*regexp;
	if (lead > 0x7f)
		return (mbtowc(wcharp, regexp, MB_LEN_MAX));

	*wcharp = (wchar_t)lead;
	return (1);
}
988
989 static char *
pop_compilep(void)990 pop_compilep(void)
991 {
992 char *compilep;
993
994 if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) {
995 return ((char *)0);
996 } else {
997 compilep = *compilep_stackp;
998 compilep_stackp++;
999 return (compilep);
1000 }
1001 }
1002
1003 static char *
push_compilep(char * compilep)1004 push_compilep(char *compilep)
1005 {
1006 if (compilep_stackp <= &compilep_stack[0]) {
1007 return ((char *)0);
1008 } else {
1009 compilep_stackp--;
1010 *compilep_stackp = compilep;
1011 return (compilep);
1012 }
1013 }
1014
1015 static boolean_t
valid_range(wchar_t lower_char,wchar_t upper_char)1016 valid_range(wchar_t lower_char, wchar_t upper_char)
1017 {
1018 return (((lower_char <= 0x7f) && (upper_char <= 0x7f) &&
1019 !iswcntrl(lower_char) && !iswcntrl(upper_char) &&
1020 (lower_char < upper_char)) ||
1021 (((lower_char & WCHAR_CSMASK) ==
1022 (upper_char & WCHAR_CSMASK)) &&
1023 (lower_char < upper_char)));
1024 }
1025