1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * IMPORTANT NOTE: 32 * 33 * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS. 34 * IT IS **NOT** CHARACTER SET INDEPENDENT. 35 * 36 */ 37 38 #pragma weak _regcmp = regcmp 39 40 #include "lint.h" 41 #include "mtlib.h" 42 #include <limits.h> 43 #include <stdarg.h> 44 #include <stdlib.h> 45 #include <thread.h> 46 #include <wctype.h> 47 #include <widec.h> 48 #include <string.h> 49 #include "tsd.h" 50 51 52 /* CONSTANTS SHARED WITH regex() */ 53 54 #include "regex.h" 55 56 /* PRIVATE CONSTANTS */ 57 58 #define BACKSLASH '\\' 59 #define CIRCUMFLEX '^' 60 #define COMMA ',' 61 #define DASH '-' 62 #define DOLLAR_SIGN '$' 63 #define DOT '.' 64 #define LEFT_CURLY_BRACE '{' 65 #define LEFT_PAREN '(' 66 #define LEFT_SQUARE_BRACKET '[' 67 #define PLUS '+' 68 #define RIGHT_CURLY_BRACE '}' 69 #define RIGHT_PAREN ')' 70 #define RIGHT_SQUARE_BRACKET ']' 71 #define SINGLE_BYTE_MASK 0xff 72 #define STRINGP_STACK_SIZE 50 73 #define STAR '*' 74 75 /* PRIVATE GLOBAL VARIABLES */ 76 77 static char *compilep_stack[STRINGP_STACK_SIZE]; 78 static char **compilep_stackp; 79 static mutex_t regcmp_lock = DEFAULTMUTEX; 80 81 /* DECLARATIONS OF PRIVATE FUNCTIONS */ 82 83 static int add_char(char *compilep, wchar_t wchar); 84 static int add_single_char_expr(char *compilep, wchar_t wchar); 85 86 #define ERROR_EXIT(mutex_lockp, arg_listp, compile_startp) \ 87 \ 88 va_end(arg_listp); \ 89 lmutex_unlock(mutex_lockp); \ 90 if ((compile_startp) != (char *)0) \ 91 free((void *)compile_startp); \ 92 return ((char *)0) 93 94 static int get_count(int *countp, const char *regexp); 95 static int get_digit(const char *regexp); 96 static int get_wchar(wchar_t *wchar, const char *regexp); 97 static char *pop_compilep(void); 98 static char *push_compilep(char *compilep); 99 static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char); 100 101 102 /* DEFINITIONS OF PUBLIC VARIABLES */ 103 104 int __i_size; 105 106 /* 107 * define thread-specific storage for __i_size 108 * 109 */ 110 int * 111 ___i_size(void) 112 { 113 if (thr_main()) 114 return (&__i_size); 115 return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL)); 116 } 117 118 #define __i_size (*(___i_size())) 119 120 /* DEFINITION OF regcmp() */ 121 122 extern char * 123 regcmp(const char *regexp, ...) 124 { 125 va_list arg_listp; 126 size_t arg_strlen; 127 boolean_t can_repeat; 128 int char_size; 129 unsigned int class_length; 130 char *compilep; 131 char *compile_startp = (char *)0; 132 int count_length; 133 wchar_t current_char; 134 int expr_length; 135 int groupn; 136 unsigned int group_length; 137 unsigned int high_bits; 138 boolean_t dash_indicates_range; 139 unsigned int low_bits; 140 int max_count; 141 int min_count; 142 const char *next_argp; 143 wchar_t first_char_in_range; 144 char *regex_typep; 145 int return_arg_number; 146 int substringn; 147 148 if (___i_size() == (int *)0) 149 return ((char *)0); 150 151 /* 152 * When compiling a regular expression, regcmp() generates at most 153 * two extra single-byte characters for each character in the 154 * expression, so allocating three times the number of bytes in all 155 * the strings that comprise the regular expression will ensure that 156 * regcmp() won't overwrite the end of the allocated block when 157 * compiling the expression. 158 */ 159 160 va_start(arg_listp, regexp); 161 next_argp = regexp; 162 arg_strlen = 0; 163 while (next_argp != (char *)0) { 164 arg_strlen += strlen(next_argp); 165 next_argp = va_arg(arg_listp, /* const */ char *); 166 } 167 va_end(arg_listp); 168 169 if (arg_strlen == 0) 170 return ((char *)0); 171 compile_startp = (char *)malloc(3 * arg_strlen + 1); 172 if (compile_startp == (char *)0) 173 return ((char *)0); 174 175 lmutex_lock(®cmp_lock); 176 __i_size = 0; 177 compilep = compile_startp; 178 compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE]; 179 180 /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */ 181 va_start(arg_listp, regexp); 182 next_argp = va_arg(arg_listp, /* const */ char *); 183 char_size = get_wchar(¤t_char, regexp); 184 if (char_size < 0) { 185 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); 186 } else if (char_size > 0) { 187 regexp += char_size; 188 } else /* (char_size == 0 ) */ { 189 regexp = next_argp; 190 next_argp = va_arg(arg_listp, /* const */ char *); 191 char_size = get_wchar(¤t_char, regexp); 192 if (char_size <= 0) { 193 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); 194 } else { 195 regexp += char_size; 196 } 197 } 198 199 /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */ 200 201 if (current_char == CIRCUMFLEX) { 202 char_size = get_wchar(¤t_char, regexp); 203 if (char_size < 0) { 204 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); 205 } else if (char_size > 0) { 206 regexp += char_size; 207 *compilep = (unsigned char)START_OF_STRING_MARK; 208 compilep++; 209 } else if /* (char_size == 0) && */ (next_argp != (char *)0) { 210 regexp = next_argp; 211 next_argp = va_arg(arg_listp, /* const */ char *); 212 char_size = get_wchar(¤t_char, regexp); 213 if (char_size <= 0) { 214 ERROR_EXIT(®cmp_lock, arg_listp, 215 compile_startp); 216 } else { 217 regexp += char_size; 218 } 219 *compilep = (unsigned char)START_OF_STRING_MARK; 220 compilep++; 221 } else { 222 /* ((char_size==0) && (next_argp==(char *)0)) */ 223 /* 224 * the regular expression is "^" 225 */ 226 *compilep = (unsigned char)START_OF_STRING_MARK; 227 compilep++; 228 *compilep = (unsigned char)END_REGEX; 229 compilep++; 230 *compilep = '\0'; 231 compilep++; 232 __i_size = (int)(compilep - compile_startp); 233 va_end(arg_listp); 234 lmutex_unlock(®cmp_lock); 235 return (compile_startp); 236 } 237 } 238 239 /* COMPILE THE REGULAR EXPRESSION */ 240 241 groupn = 0; 242 substringn = 0; 243 can_repeat = B_FALSE; 244 for (;;) { 245 246 /* 247 * At the end of each iteration get the next character 248 * from the regular expression and increment regexp to 249 * point to the following character. Exit when all 250 * the characters in all the strings in the argument 251 * list have been read. 252 */ 253 254 switch (current_char) { 255 256 /* 257 * No fall-through. Each case ends with either 258 * a break or an error exit. Each case starts 259 * with compilep addressing the next location to 260 * be written in the compiled regular expression, 261 * and with regexp addressing the next character 262 * to be read from the regular expression being 263 * compiled. Each case that doesn't return 264 * increments regexp to address the next character 265 * to be read from the regular expression and 266 * increments compilep to address the next 267 * location to be written in the compiled 268 * regular expression. 269 * 270 * NOTE: The comments for each case give the meaning 271 * of the regular expression compiled by the case 272 * and the character string written to the compiled 273 * regular expression by the case. Each single 274 * character 275 * written to the compiled regular expression is 276 * shown enclosed in angle brackets (<>). Each 277 * compiled regular expression begins with a marker 278 * character which is shown as a named constant 279 * (e.g. <ASCII_CHAR>). Character constants are 280 * shown enclosed in single quotes (e.g. <'$'>). 281 * All other single characters written to the 282 * compiled regular expression are shown as lower 283 * case variable names (e.g. <ascii_char> or 284 * <multibyte_char>). Multicharacter 285 * strings written to the compiled regular expression 286 * are shown as variable names followed by elipses 287 * (e.g. <regex...>). 288 */ 289 290 case DOLLAR_SIGN: 291 /* end of string marker or simple dollar sign */ 292 /* compiles to <END_OF_STRING_MARK> or */ 293 /* <ASCII_CHAR><'$'> */ 294 295 char_size = get_wchar(¤t_char, regexp); 296 if ((char_size == 0) && (next_argp == (char *)0)) { 297 can_repeat = B_FALSE; 298 *compilep = (unsigned char)END_OF_STRING_MARK; 299 compilep++; 300 } else { 301 can_repeat = B_TRUE; 302 *compilep = (unsigned char)ASCII_CHAR; 303 regex_typep = compilep; 304 compilep++; 305 *compilep = DOLLAR_SIGN; 306 compilep++; 307 } 308 break; /* end case DOLLAR_SIGN */ 309 310 case DOT: /* any character */ 311 312 /* compiles to <ANY_CHAR> */ 313 314 can_repeat = B_TRUE; 315 *compilep = (unsigned char)ANY_CHAR; 316 regex_typep = compilep; 317 compilep++; 318 319 break; /* end case DOT */ 320 321 case BACKSLASH: /* escaped character */ 322 323 /* 324 * compiles to <ASCII_CHAR><ascii_char> or 325 * <MULTIBYTE_CHAR><multibyte_char> 326 */ 327 328 char_size = get_wchar(¤t_char, regexp); 329 if (char_size <= 0) { 330 ERROR_EXIT(®cmp_lock, arg_listp, 331 compile_startp); 332 } else { 333 regexp += char_size; 334 can_repeat = B_TRUE; 335 expr_length = add_single_char_expr( 336 compilep, current_char); 337 regex_typep = compilep; 338 compilep += expr_length; 339 } 340 break; /* end case '\\' */ 341 342 case LEFT_SQUARE_BRACKET: 343 /* start of a character class expression */ 344 345 /* 346 * [^...c...] compiles to 347 * <NOT_IN_CLASS><class_length><...c...> 348 * [^...a-z...] compiles to 349 * <NOT_IN_CLASS><class_length><...a<THRU>z...> 350 * [...c...] compiles to 351 * <IN_CLASS><class_length><...c...> 352 * [...a-z...] compiles to 353 * <IN_CLASS><class_length><...a<THRU>z...> 354 * 355 * NOTE: <class_length> includes the 356 * <class_length> byte 357 */ 358 359 can_repeat = B_TRUE; 360 regex_typep = compilep; 361 362 /* DETERMINE THE CLASS TYPE */ 363 364 /* 365 * NOTE: This algorithm checks the value of the 366 * "multibyte" 367 * macro in <euc.h> (included in <widec.h> ) 368 * to find out if regcmp() 369 * is compiling the regular expression in a 370 * multibyte locale. 371 */ 372 char_size = get_wchar(¤t_char, regexp); 373 if (char_size <= 0) { 374 ERROR_EXIT(®cmp_lock, arg_listp, 375 compile_startp); 376 } else if (current_char == CIRCUMFLEX) { 377 regexp++; 378 char_size = get_wchar(¤t_char, regexp); 379 if (char_size <= 0) { 380 ERROR_EXIT(®cmp_lock, 381 arg_listp, compile_startp); 382 } else { 383 regexp += char_size; 384 if (!multibyte) { 385 *compilep = (unsigned char) 386 NOT_IN_ASCII_CHAR_CLASS; 387 } else { 388 *compilep = (unsigned char) 389 NOT_IN_MULTIBYTE_CHAR_CLASS; 390 } 391 /* leave space for <class_length> */ 392 compilep += 2; 393 } 394 } else { 395 regexp += char_size; 396 if (!multibyte) { 397 *compilep = (unsigned char) 398 IN_ASCII_CHAR_CLASS; 399 } else { 400 *compilep = (unsigned char) 401 IN_MULTIBYTE_CHAR_CLASS; 402 } 403 /* leave space for <class_length> */ 404 compilep += 2; 405 } 406 407 /* COMPILE THE CLASS */ 408 /* 409 * check for a leading right square bracket, 410 * which is allowed 411 */ 412 413 if (current_char == RIGHT_SQUARE_BRACKET) { 414 /* 415 * the leading RIGHT_SQUARE_BRACKET may 416 * be part of a character range 417 * expression like "[]-\]" 418 */ 419 dash_indicates_range = B_TRUE; 420 first_char_in_range = current_char; 421 char_size = get_wchar(¤t_char, regexp); 422 if (char_size <= 0) { 423 ERROR_EXIT(®cmp_lock, 424 arg_listp, compile_startp); 425 } else { 426 regexp += char_size; 427 *compilep = RIGHT_SQUARE_BRACKET; 428 compilep++; 429 } 430 } else { 431 /* 432 * decode the character in the following 433 * while loop and decide then if it can 434 * be the first character 435 * in a character range expression 436 */ 437 dash_indicates_range = B_FALSE; 438 } 439 440 while (current_char != RIGHT_SQUARE_BRACKET) { 441 if (current_char != DASH) { 442 /* 443 * if a DASH follows current_char, 444 * current_char, the DASH and the 445 * character that follows the DASH 446 * may form a character range 447 * expression 448 */ 449 dash_indicates_range = B_TRUE; 450 first_char_in_range = current_char; 451 expr_length = add_char( 452 compilep, current_char); 453 compilep += expr_length; 454 455 } else if /* (current_char == DASH) && */ 456 (dash_indicates_range == B_FALSE) { 457 /* 458 * current_char is a DASH, but 459 * either begins the entire 460 * character class or follows a 461 * character that's already 462 * part of a character range 463 * expression, so it simply 464 * represents the DASH character 465 * itself 466 */ 467 *compilep = DASH; 468 compilep ++; 469 /* 470 * if another DASH follows this 471 * one, this DASH is part 472 * of a character range expression 473 * like "[--\]" 474 */ 475 dash_indicates_range = B_TRUE; 476 first_char_in_range = current_char; 477 478 } else { 479 /* 480 * ((current_char == DASH &&/ 481 * (dash_indicates_range == B_TRUE)) 482 */ 483 484 /* 485 * the DASH appears after a single 486 * character that isn't 487 * already part of a character 488 * range expression, so it 489 * and the characters preceding 490 * and following it can form a 491 * character range expression 492 * like "[a-z]" 493 */ 494 char_size = get_wchar( 495 ¤t_char, regexp); 496 if (char_size <= 0) { 497 ERROR_EXIT(®cmp_lock, 498 arg_listp, compile_startp); 499 500 } else if (current_char == 501 RIGHT_SQUARE_BRACKET) { 502 /* 503 * the preceding DASH is 504 * the last character in the 505 * class and represents the 506 * DASH character itself 507 */ 508 *compilep = DASH; 509 compilep++; 510 511 } else if (valid_range( 512 first_char_in_range, 513 current_char) == B_FALSE) { 514 ERROR_EXIT(®cmp_lock, 515 arg_listp, compile_startp); 516 } else { 517 /* 518 * the DASH is part of a 519 * character range 520 * expression; encode the 521 * rest of the expression 522 */ 523 regexp += char_size; 524 *compilep = (unsigned char) 525 THRU; 526 compilep++; 527 expr_length = add_char( 528 compilep, current_char); 529 compilep += expr_length; 530 /* 531 * if a DASH follows this 532 * character range 533 * expression, 534 * it represents the DASH 535 * character itself 536 */ 537 dash_indicates_range = 538 B_FALSE; 539 } 540 } 541 542 /* GET THE NEXT CHARACTER */ 543 544 char_size = get_wchar(¤t_char, regexp); 545 if (char_size <= 0) { 546 ERROR_EXIT(®cmp_lock, 547 arg_listp, compile_startp); 548 } else { 549 regexp += char_size; 550 } 551 552 } 553 /* end while (current_char != RIGHT_SQUARE_BRACKET) */ 554 555 /* INSERT THE LENGTH OF THE CLASS INTO THE */ 556 /* COMPILED EXPRESSION */ 557 558 class_length = (unsigned int) 559 (compilep - regex_typep - 1); 560 if ((class_length < 2) || 561 (class_length > MAX_SINGLE_BYTE_INT)) { 562 ERROR_EXIT(®cmp_lock, arg_listp, 563 compile_startp); 564 } else { 565 *(regex_typep + 1) = (unsigned char) 566 class_length; 567 } 568 break; /* end case LEFT_SQUARE_BRACKET */ 569 570 case LEFT_PAREN: 571 572 /* 573 * start of a parenthesized group of regular 574 * expressions compiles to <'\0'><'\0'>, leaving 575 * space in the compiled regular expression for 576 * <group_type|ADDED_LENGTH_BITS><group_length> 577 */ 578 579 if (push_compilep(compilep) == (char *)0) { 580 /* 581 * groups can contain groups, so group 582 * start pointers 583 * must be saved and restored in sequence 584 */ 585 ERROR_EXIT(®cmp_lock, arg_listp, 586 compile_startp); 587 } else { 588 can_repeat = B_FALSE; 589 *compilep = '\0'; /* for debugging */ 590 compilep++; 591 *compilep = '\0'; /* for debugging */ 592 compilep++; 593 } 594 break; /* end case LEFT_PAREN */ 595 596 case RIGHT_PAREN: 597 /* end of a marked group of regular expressions */ 598 599 /* 600 * (<regex>)$0-9 compiles to 601 * <SAVED_GROUP><substringn><compiled_regex...>\ 602 * <END_SAVED_GROUP><substringn><return_arg_number> 603 * (<regex>)* compiles to 604 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS> 605 * <group_length> <compiled_regex...> 606 * <END_GROUP|ZERO_OR_MORE><groupn> 607 * (<regex>)+ compiles to 608 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS> 609 * <group_length>\ 610 * <compiled_regex...><END_GROUP|ONE_OR_MORE> 611 * <groupn> 612 * (<regex>){...} compiles to 613 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\ 614 * <compiled_regex...><END_GROUP|COUNT><groupn>\ 615 * <minimum_repeat_count><maximum_repeat_count> 616 * otherwise (<regex>) compiles to 617 * <SIMPLE_GROUP><blank><compiled_regex...> 618 * <END_GROUP><groupn> 619 * 620 * NOTE: 621 * 622 * group_length + (256 * ADDED_LENGTH_BITS) == 623 * length_of(<compiled_regex...><END_GROUP|...> 624 * <groupn>) 625 * which also == 626 * length_of(<group_type|ADDED_LENGTH_BITS> 627 * <group_length>\ <compiled_regex...>) 628 * groupn no longer seems to be used, but the code 629 * still computes it to preserve backward 630 * compatibility 631 * with earlier versions of regex(). 632 */ 633 634 /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */ 635 636 regex_typep = pop_compilep(); 637 if (regex_typep == (char *)0) { 638 ERROR_EXIT(®cmp_lock, arg_listp, 639 compile_startp); 640 } 641 char_size = get_wchar(¤t_char, regexp); 642 if (char_size < 0) { 643 ERROR_EXIT(®cmp_lock, arg_listp, 644 compile_startp); 645 } else if (char_size == 0) { 646 *regex_typep = SIMPLE_GROUP; 647 can_repeat = B_TRUE; 648 *compilep = (unsigned char)END_GROUP; 649 regex_typep = compilep; 650 compilep++; 651 *compilep = (unsigned char)groupn; 652 groupn++; 653 compilep++; 654 } else if (current_char == DOLLAR_SIGN) { 655 *regex_typep = SAVED_GROUP; 656 regex_typep++; 657 *regex_typep = (char)substringn; 658 can_repeat = B_FALSE; 659 regexp ++; 660 return_arg_number = get_digit(regexp); 661 if ((return_arg_number < 0) || 662 (substringn >= NSUBSTRINGS)) { 663 ERROR_EXIT(®cmp_lock, arg_listp, 664 compile_startp); 665 } 666 regexp++; 667 *compilep = (unsigned char)END_SAVED_GROUP; 668 compilep++; 669 *compilep = (unsigned char)substringn; 670 substringn++; 671 compilep++; 672 *compilep = (unsigned char)return_arg_number; 673 compilep++; 674 } else { 675 switch (current_char) { 676 case STAR: 677 *regex_typep = ZERO_OR_MORE_GROUP; 678 break; 679 case PLUS: 680 *regex_typep = ONE_OR_MORE_GROUP; 681 break; 682 case LEFT_CURLY_BRACE: 683 *regex_typep = COUNTED_GROUP; 684 break; 685 default: 686 *regex_typep = SIMPLE_GROUP; 687 } 688 if (*regex_typep != SIMPLE_GROUP) { 689 group_length = (unsigned int) 690 (compilep - regex_typep); 691 if (group_length >= 1024) { 692 ERROR_EXIT(®cmp_lock, 693 arg_listp, compile_startp); 694 } 695 high_bits = group_length >> 696 TIMES_256_SHIFT; 697 low_bits = group_length & 698 SINGLE_BYTE_MASK; 699 *regex_typep = 700 (unsigned char) 701 ((unsigned int) 702 *regex_typep | high_bits); 703 regex_typep++; 704 *regex_typep = 705 (unsigned char)low_bits; 706 } 707 can_repeat = B_TRUE; 708 *compilep = (unsigned char)END_GROUP; 709 regex_typep = compilep; 710 compilep++; 711 *compilep = (unsigned char)groupn; 712 groupn++; 713 compilep++; 714 } 715 716 break; /* end case RIGHT_PAREN */ 717 718 case STAR: /* zero or more repetitions of the */ 719 /* preceding expression */ 720 721 /* 722 * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\ 723 * <compiled_regex...> 724 * (<regex...>)* compiles to 725 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\ 726 * <group_length><compiled_regex...>\ 727 * <END_GROUP|ZERO_OR_MORE><groupn> 728 */ 729 730 if (can_repeat == B_FALSE) { 731 ERROR_EXIT(®cmp_lock, arg_listp, 732 compile_startp); 733 } else { 734 can_repeat = B_FALSE; 735 *regex_typep = (unsigned char) 736 ((unsigned int)*regex_typep | ZERO_OR_MORE); 737 } 738 break; /* end case '*' */ 739 740 case PLUS: 741 /* one or more repetitions of the preceding */ 742 /* expression */ 743 744 /* 745 * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\ 746 * <compiled_regex...> (<regex...>)+ compiles to 747 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\ 748 * <group_length><compiled_regex...>\ 749 * <END_GROUP|ONE_OR_MORE><groupn> 750 */ 751 752 if (can_repeat == B_FALSE) { 753 ERROR_EXIT(®cmp_lock, arg_listp, 754 compile_startp); 755 } else { 756 can_repeat = B_FALSE; 757 *regex_typep = 758 (unsigned char)((unsigned int)* 759 regex_typep | ONE_OR_MORE); 760 } 761 break; /* end case '+' */ 762 763 case LEFT_CURLY_BRACE: 764 765 /* 766 * repeat the preceding regular expression 767 * at least min_count times 768 * and at most max_count times 769 * 770 * <regex...>{min_count} compiles to 771 * <regex type|COUNT><compiled_regex...> 772 * <min_count><min_count> 773 * 774 * <regex...>{min_count,} compiles to 775 * <regex type|COUNT><compiled_regex...> 776 * <min_count><UNLIMITED> 777 * 778 * <regex...>{min_count,max_count} compiles to 779 * <regex type>|COUNT><compiled_regex...> 780 * <min_count><max_count> 781 * 782 * (<regex...>){min_count,max_count} compiles to 783 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\ 784 * <compiled_regex...><END_GROUP|COUNT><groupn>\ 785 * <minimum_match_count><maximum_match_count> 786 */ 787 788 if (can_repeat == B_FALSE) { 789 ERROR_EXIT(®cmp_lock, arg_listp, 790 compile_startp); 791 } 792 can_repeat = B_FALSE; 793 *regex_typep = (unsigned char)((unsigned int)* 794 regex_typep | COUNT); 795 count_length = get_count(&min_count, regexp); 796 if (count_length <= 0) { 797 ERROR_EXIT(®cmp_lock, arg_listp, 798 compile_startp); 799 } 800 regexp += count_length; 801 802 if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */ 803 regexp++; 804 max_count = min_count; 805 } else if (*regexp == COMMA) { /* {min_count,..} */ 806 regexp++; 807 /* {min_count,} */ 808 if (*regexp == RIGHT_CURLY_BRACE) { 809 regexp++; 810 max_count = UNLIMITED; 811 } else { /* {min_count,max_count} */ 812 count_length = get_count( 813 &max_count, regexp); 814 if (count_length <= 0) { 815 ERROR_EXIT(®cmp_lock, 816 arg_listp, compile_startp); 817 } 818 regexp += count_length; 819 if (*regexp != RIGHT_CURLY_BRACE) { 820 ERROR_EXIT(®cmp_lock, 821 arg_listp, compile_startp); 822 } 823 regexp++; 824 } 825 } else { /* invalid expression */ 826 ERROR_EXIT(®cmp_lock, arg_listp, 827 compile_startp); 828 } 829 830 if ((min_count > MAX_SINGLE_BYTE_INT) || 831 ((max_count != UNLIMITED) && 832 (min_count > max_count))) { 833 ERROR_EXIT(®cmp_lock, arg_listp, 834 compile_startp); 835 } else { 836 *compilep = (unsigned char)min_count; 837 compilep++; 838 *compilep = (unsigned char)max_count; 839 compilep++; 840 } 841 break; /* end case LEFT_CURLY_BRACE */ 842 843 default: /* a single non-special character */ 844 845 /* 846 * compiles to <ASCII_CHAR><ascii_char> or 847 * <MULTIBYTE_CHAR><multibyte_char> 848 */ 849 850 can_repeat = B_TRUE; 851 regex_typep = compilep; 852 expr_length = add_single_char_expr(compilep, 853 current_char); 854 compilep += expr_length; 855 856 } /* end switch (current_char) */ 857 858 /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */ 859 860 char_size = get_wchar(¤t_char, regexp); 861 if (char_size < 0) { 862 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); 863 } else if (char_size > 0) { 864 regexp += char_size; 865 } else if /* (char_size == 0) && */ (next_argp != (char *)0) { 866 regexp = next_argp; 867 next_argp = va_arg(arg_listp, /* const */ char *); 868 char_size = get_wchar(¤t_char, regexp); 869 if (char_size <= 0) { 870 ERROR_EXIT(®cmp_lock, arg_listp, 871 compile_startp); 872 } else { 873 regexp += char_size; 874 } 875 } else /* ((char_size == 0) && (next_argp == (char *)0)) */ { 876 if (pop_compilep() != (char *)0) { 877 /* unmatched parentheses */ 878 ERROR_EXIT(®cmp_lock, arg_listp, 879 compile_startp); 880 } 881 *compilep = (unsigned char)END_REGEX; 882 compilep++; 883 *compilep = '\0'; 884 compilep++; 885 __i_size = (int)(compilep - compile_startp); 886 va_end(arg_listp); 887 lmutex_unlock(®cmp_lock); 888 return (compile_startp); 889 } 890 } /* end for (;;) */ 891 892 } /* regcmp() */ 893 894 895 /* DEFINITIONS OF PRIVATE FUNCTIONS */ 896 897 static int 898 add_char(char *compilep, wchar_t wchar) 899 { 900 int expr_length; 901 902 if ((unsigned int)wchar <= (unsigned int)0x7f) { 903 *compilep = (unsigned char)wchar; 904 expr_length = 1; 905 } else { 906 expr_length = wctomb(compilep, wchar); 907 } 908 return (expr_length); 909 } 910 911 static int 912 add_single_char_expr(char *compilep, wchar_t wchar) 913 { 914 int expr_length = 0; 915 916 if ((unsigned int)wchar <= (unsigned int)0x7f) { 917 *compilep = (unsigned char)ASCII_CHAR; 918 compilep++; 919 *compilep = (unsigned char)wchar; 920 expr_length += 2; 921 } else { 922 *compilep = (unsigned char)MULTIBYTE_CHAR; 923 compilep++; 924 expr_length++; 925 expr_length += wctomb(compilep, wchar); 926 } 927 return (expr_length); 928 } 929 930 static int 931 get_count(int *countp, const char *regexp) 932 { 933 char count_char = '0'; 934 int count = 0; 935 int count_length = 0; 936 937 if (regexp == (char *)0) { 938 return ((int)0); 939 } else { 940 count_char = *regexp; 941 while (('0' <= count_char) && (count_char <= '9')) { 942 count = (10 * count) + (int)(count_char - '0'); 943 count_length++; 944 regexp++; 945 count_char = *regexp; 946 } 947 } 948 *countp = count; 949 return (count_length); 950 } 951 952 static int 953 get_digit(const char *regexp) 954 { 955 char digit; 956 957 if (regexp == (char *)0) { 958 return ((int)-1); 959 } else { 960 digit = *regexp; 961 if (('0' <= digit) && (digit <= '9')) { 962 return ((int)(digit - '0')); 963 } else { 964 return ((int)-1); 965 } 966 } 967 } 968 969 static int 970 get_wchar(wchar_t *wcharp, const char *regexp) 971 { 972 int char_size; 973 974 if (regexp == (char *)0) { 975 char_size = 0; 976 *wcharp = (wchar_t)((unsigned int)'\0'); 977 } else if (*regexp == '\0') { 978 char_size = 0; 979 *wcharp = (wchar_t)((unsigned int)*regexp); 980 } else if ((unsigned char)*regexp <= (unsigned char)0x7f) { 981 char_size = 1; 982 *wcharp = (wchar_t)((unsigned int)*regexp); 983 } else { 984 char_size = mbtowc(wcharp, regexp, MB_LEN_MAX); 985 } 986 return (char_size); 987 } 988 989 static char * 990 pop_compilep(void) 991 { 992 char *compilep; 993 994 if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) { 995 return ((char *)0); 996 } else { 997 compilep = *compilep_stackp; 998 compilep_stackp++; 999 return (compilep); 1000 } 1001 } 1002 1003 static char * 1004 push_compilep(char *compilep) 1005 { 1006 if (compilep_stackp <= &compilep_stack[0]) { 1007 return ((char *)0); 1008 } else { 1009 compilep_stackp--; 1010 *compilep_stackp = compilep; 1011 return (compilep); 1012 } 1013 } 1014 1015 static boolean_t 1016 valid_range(wchar_t lower_char, wchar_t upper_char) 1017 { 1018 return (((lower_char <= 0x7f) && (upper_char <= 0x7f) && 1019 !iswcntrl(lower_char) && !iswcntrl(upper_char) && 1020 (lower_char < upper_char)) || 1021 (((lower_char & WCHAR_CSMASK) == 1022 (upper_char & WCHAR_CSMASK)) && 1023 (lower_char < upper_char))); 1024 } 1025