1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 /* 33 * IMPORTANT NOTE: 34 * 35 * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS. 36 * IT IS **NOT** CHARACTER SET INDEPENDENT. 37 * 38 */ 39 40 #pragma weak regcmp = _regcmp 41 42 #include "synonyms.h" 43 #include "mtlib.h" 44 #include <limits.h> 45 #include <stdarg.h> 46 #include <stdlib.h> 47 #include <thread.h> 48 #include <wctype.h> 49 #include <widec.h> 50 #include <string.h> 51 #include "tsd.h" 52 53 54 /* CONSTANTS SHARED WITH regex() */ 55 56 #include "regex.h" 57 58 /* PRIVATE CONSTANTS */ 59 60 #define BACKSLASH '\\' 61 #define CIRCUMFLEX '^' 62 #define COMMA ',' 63 #define DASH '-' 64 #define DOLLAR_SIGN '$' 65 #define DOT '.' 66 #define LEFT_CURLY_BRACE '{' 67 #define LEFT_PAREN '(' 68 #define LEFT_SQUARE_BRACKET '[' 69 #define PLUS '+' 70 #define RIGHT_CURLY_BRACE '}' 71 #define RIGHT_PAREN ')' 72 #define RIGHT_SQUARE_BRACKET ']' 73 #define SINGLE_BYTE_MASK 0xff 74 #define STRINGP_STACK_SIZE 50 75 #define STAR '*' 76 77 /* PRIVATE GLOBAL VARIABLES */ 78 79 static char *compilep_stack[STRINGP_STACK_SIZE]; 80 static char **compilep_stackp; 81 static mutex_t regcmp_lock = DEFAULTMUTEX; 82 83 /* DECLARATIONS OF PRIVATE FUNCTIONS */ 84 85 static int add_char(char *compilep, wchar_t wchar); 86 static int add_single_char_expr(char *compilep, wchar_t wchar); 87 88 #define ERROR_EXIT(mutex_lockp, arg_listp, compile_startp) \ 89 \ 90 va_end(arg_listp); \ 91 lmutex_unlock(mutex_lockp); \ 92 if ((compile_startp) != (char *)0) \ 93 free((void *)compile_startp); \ 94 return ((char *)0) 95 96 static int get_count(int *countp, const char *regexp); 97 static int get_digit(const char *regexp); 98 static int get_wchar(wchar_t *wchar, const char *regexp); 99 static char *pop_compilep(void); 100 static char *push_compilep(char *compilep); 101 static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char); 102 103 104 /* DEFINITIONS OF PUBLIC VARIABLES */ 105 106 int __i_size; 107 108 /* 109 * define thread-specific storage for __i_size 110 * 111 */ 112 int * 113 ___i_size(void) 114 { 115 if (_thr_main()) 116 return (&__i_size); 117 return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL)); 118 } 119 120 #define __i_size (*(___i_size())) 121 122 /* DEFINITION OF regcmp() */ 123 124 extern char * 125 regcmp(const char *regexp, ...) 126 { 127 va_list arg_listp; 128 size_t arg_strlen; 129 boolean_t can_repeat; 130 int char_size; 131 unsigned int class_length; 132 char *compilep; 133 char *compile_startp = (char *)0; 134 int count_length; 135 wchar_t current_char; 136 int expr_length; 137 int groupn; 138 unsigned int group_length; 139 unsigned int high_bits; 140 boolean_t dash_indicates_range; 141 unsigned int low_bits; 142 int max_count; 143 int min_count; 144 const char *next_argp; 145 wchar_t first_char_in_range; 146 char *regex_typep; 147 int return_arg_number; 148 int substringn; 149 150 if (___i_size() == (int *)0) 151 return ((char *)0); 152 153 /* 154 * When compiling a regular expression, regcmp() generates at most 155 * two extra single-byte characters for each character in the 156 * expression, so allocating three times the number of bytes in all 157 * the strings that comprise the regular expression will ensure that 158 * regcmp() won't overwrite the end of the allocated block when 159 * compiling the expression. 160 */ 161 162 va_start(arg_listp, regexp); 163 next_argp = regexp; 164 arg_strlen = 0; 165 while (next_argp != (char *)0) { 166 arg_strlen += strlen(next_argp); 167 next_argp = va_arg(arg_listp, /* const */ char *); 168 } 169 va_end(arg_listp); 170 171 if (arg_strlen == 0) 172 return ((char *)0); 173 compile_startp = (char *)malloc(3 * arg_strlen); 174 if (compile_startp == (char *)0) 175 return ((char *)0); 176 177 lmutex_lock(®cmp_lock); 178 __i_size = 0; 179 compilep = compile_startp; 180 compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE]; 181 182 /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */ 183 va_start(arg_listp, regexp); 184 next_argp = va_arg(arg_listp, /* const */ char *); 185 char_size = get_wchar(¤t_char, regexp); 186 if (char_size < 0) { 187 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); 188 } else if (char_size > 0) { 189 regexp += char_size; 190 } else /* (char_size == 0 ) */ { 191 regexp = next_argp; 192 next_argp = va_arg(arg_listp, /* const */ char *); 193 char_size = get_wchar(¤t_char, regexp); 194 if (char_size <= 0) { 195 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); 196 } else { 197 regexp += char_size; 198 } 199 } 200 201 /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */ 202 203 if (current_char == CIRCUMFLEX) { 204 char_size = get_wchar(¤t_char, regexp); 205 if (char_size < 0) { 206 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); 207 } else if (char_size > 0) { 208 regexp += char_size; 209 *compilep = (unsigned char)START_OF_STRING_MARK; 210 compilep++; 211 } else if /* (char_size == 0) && */ (next_argp != (char *)0) { 212 regexp = next_argp; 213 next_argp = va_arg(arg_listp, /* const */ char *); 214 char_size = get_wchar(¤t_char, regexp); 215 if (char_size <= 0) { 216 ERROR_EXIT(®cmp_lock, arg_listp, 217 compile_startp); 218 } else { 219 regexp += char_size; 220 } 221 *compilep = (unsigned char)START_OF_STRING_MARK; 222 compilep++; 223 } else { 224 /* ((char_size==0) && (next_argp==(char *)0)) */ 225 /* 226 * the regular expression is "^" 227 */ 228 *compilep = (unsigned char)START_OF_STRING_MARK; 229 compilep++; 230 *compilep = (unsigned char)END_REGEX; 231 compilep++; 232 *compilep = '\0'; 233 compilep++; 234 __i_size = (int)(compilep - compile_startp); 235 va_end(arg_listp); 236 lmutex_unlock(®cmp_lock); 237 return (compile_startp); 238 } 239 } 240 241 /* COMPILE THE REGULAR EXPRESSION */ 242 243 groupn = 0; 244 substringn = 0; 245 can_repeat = B_FALSE; 246 for (;;) { 247 248 /* 249 * At the end of each iteration get the next character 250 * from the regular expression and increment regexp to 251 * point to the following character. Exit when all 252 * the characters in all the strings in the argument 253 * list have been read. 254 */ 255 256 switch (current_char) { 257 258 /* 259 * No fall-through. Each case ends with either 260 * a break or an error exit. Each case starts 261 * with compilep addressing the next location to 262 * be written in the compiled regular expression, 263 * and with regexp addressing the next character 264 * to be read from the regular expression being 265 * compiled. Each case that doesn't return 266 * increments regexp to address the next character 267 * to be read from the regular expression and 268 * increments compilep to address the next 269 * location to be written in the compiled 270 * regular expression. 271 * 272 * NOTE: The comments for each case give the meaning 273 * of the regular expression compiled by the case 274 * and the character string written to the compiled 275 * regular expression by the case. Each single 276 * character 277 * written to the compiled regular expression is 278 * shown enclosed in angle brackets (<>). Each 279 * compiled regular expression begins with a marker 280 * character which is shown as a named constant 281 * (e.g. <ASCII_CHAR>). Character constants are 282 * shown enclosed in single quotes (e.g. <'$'>). 283 * All other single characters written to the 284 * compiled regular expression are shown as lower 285 * case variable names (e.g. <ascii_char> or 286 * <multibyte_char>). Multicharacter 287 * strings written to the compiled regular expression 288 * are shown as variable names followed by elipses 289 * (e.g. <regex...>). 290 */ 291 292 case DOLLAR_SIGN: 293 /* end of string marker or simple dollar sign */ 294 /* compiles to <END_OF_STRING_MARK> or */ 295 /* <ASCII_CHAR><'$'> */ 296 297 char_size = get_wchar(¤t_char, regexp); 298 if ((char_size == 0) && (next_argp == (char *)0)) { 299 can_repeat = B_FALSE; 300 *compilep = (unsigned char)END_OF_STRING_MARK; 301 compilep++; 302 } else { 303 can_repeat = B_TRUE; 304 *compilep = (unsigned char)ASCII_CHAR; 305 regex_typep = compilep; 306 compilep++; 307 *compilep = DOLLAR_SIGN; 308 compilep++; 309 } 310 break; /* end case DOLLAR_SIGN */ 311 312 case DOT: /* any character */ 313 314 /* compiles to <ANY_CHAR> */ 315 316 can_repeat = B_TRUE; 317 *compilep = (unsigned char)ANY_CHAR; 318 regex_typep = compilep; 319 compilep++; 320 321 break; /* end case DOT */ 322 323 case BACKSLASH: /* escaped character */ 324 325 /* 326 * compiles to <ASCII_CHAR><ascii_char> or 327 * <MULTIBYTE_CHAR><multibyte_char> 328 */ 329 330 char_size = get_wchar(¤t_char, regexp); 331 if (char_size <= 0) { 332 ERROR_EXIT(®cmp_lock, arg_listp, 333 compile_startp); 334 } else { 335 regexp += char_size; 336 can_repeat = B_TRUE; 337 expr_length = add_single_char_expr( 338 compilep, current_char); 339 regex_typep = compilep; 340 compilep += expr_length; 341 } 342 break; /* end case '\\' */ 343 344 case LEFT_SQUARE_BRACKET: 345 /* start of a character class expression */ 346 347 /* 348 * [^...c...] compiles to 349 * <NOT_IN_CLASS><class_length><...c...> 350 * [^...a-z...] compiles to 351 * <NOT_IN_CLASS><class_length><...a<THRU>z...> 352 * [...c...] compiles to 353 * <IN_CLASS><class_length><...c...> 354 * [...a-z...] compiles to 355 * <IN_CLASS><class_length><...a<THRU>z...> 356 * 357 * NOTE: <class_length> includes the 358 * <class_length> byte 359 */ 360 361 can_repeat = B_TRUE; 362 regex_typep = compilep; 363 364 /* DETERMINE THE CLASS TYPE */ 365 366 /* 367 * NOTE: This algorithm checks the value of the 368 * "multibyte" 369 * macro in <euc.h> (included in <widec.h> ) 370 * to find out if regcmp() 371 * is compiling the regular expression in a 372 * multibyte locale. 373 */ 374 char_size = get_wchar(¤t_char, regexp); 375 if (char_size <= 0) { 376 ERROR_EXIT(®cmp_lock, arg_listp, 377 compile_startp); 378 } else if (current_char == CIRCUMFLEX) { 379 regexp++; 380 char_size = get_wchar(¤t_char, regexp); 381 if (char_size <= 0) { 382 ERROR_EXIT(®cmp_lock, 383 arg_listp, compile_startp); 384 } else { 385 regexp += char_size; 386 if (!multibyte) { 387 *compilep = (unsigned char) 388 NOT_IN_ASCII_CHAR_CLASS; 389 } else { 390 *compilep = (unsigned char) 391 NOT_IN_MULTIBYTE_CHAR_CLASS; 392 } 393 /* leave space for <class_length> */ 394 compilep += 2; 395 } 396 } else { 397 regexp += char_size; 398 if (!multibyte) { 399 *compilep = (unsigned char) 400 IN_ASCII_CHAR_CLASS; 401 } else { 402 *compilep = (unsigned char) 403 IN_MULTIBYTE_CHAR_CLASS; 404 } 405 /* leave space for <class_length> */ 406 compilep += 2; 407 } 408 409 /* COMPILE THE CLASS */ 410 /* 411 * check for a leading right square bracket, 412 * which is allowed 413 */ 414 415 if (current_char == RIGHT_SQUARE_BRACKET) { 416 /* 417 * the leading RIGHT_SQUARE_BRACKET may 418 * be part of a character range 419 * expression like "[]-\]" 420 */ 421 dash_indicates_range = B_TRUE; 422 first_char_in_range = current_char; 423 char_size = get_wchar(¤t_char, regexp); 424 if (char_size <= 0) { 425 ERROR_EXIT(®cmp_lock, 426 arg_listp, compile_startp); 427 } else { 428 regexp += char_size; 429 *compilep = RIGHT_SQUARE_BRACKET; 430 compilep++; 431 } 432 } else { 433 /* 434 * decode the character in the following 435 * while loop and decide then if it can 436 * be the first character 437 * in a character range expression 438 */ 439 dash_indicates_range = B_FALSE; 440 } 441 442 while (current_char != RIGHT_SQUARE_BRACKET) { 443 if (current_char != DASH) { 444 /* 445 * if a DASH follows current_char, 446 * current_char, the DASH and the 447 * character that follows the DASH 448 * may form a character range 449 * expression 450 */ 451 dash_indicates_range = B_TRUE; 452 first_char_in_range = current_char; 453 expr_length = add_char( 454 compilep, current_char); 455 compilep += expr_length; 456 457 } else if /* (current_char == DASH) && */ 458 (dash_indicates_range == B_FALSE) { 459 /* 460 * current_char is a DASH, but 461 * either begins the entire 462 * character class or follows a 463 * character that's already 464 * part of a character range 465 * expression, so it simply 466 * represents the DASH character 467 * itself 468 */ 469 *compilep = DASH; 470 compilep ++; 471 /* 472 * if another DASH follows this 473 * one, this DASH is part 474 * of a character range expression 475 * like "[--\]" 476 */ 477 dash_indicates_range = B_TRUE; 478 first_char_in_range = current_char; 479 480 } else /* ((current_char == DASH && */ 481 /* (dash_indicates_range == B_TRUE)) */ { 482 /* 483 * the DASH appears after a single 484 * character that isn't 485 * already part of a character 486 * range expression, so it 487 * and the characters preceding 488 * and following it can form a 489 * character range expression 490 * like "[a-z]" 491 */ 492 char_size = get_wchar( 493 ¤t_char, regexp); 494 if (char_size <= 0) { 495 ERROR_EXIT(®cmp_lock, 496 arg_listp, compile_startp); 497 498 } else if (current_char == 499 RIGHT_SQUARE_BRACKET) { 500 /* 501 * the preceding DASH is 502 * the last character in the 503 * class and represents the 504 * DASH character itself 505 */ 506 *compilep = DASH; 507 compilep++; 508 509 } else if (valid_range( 510 first_char_in_range, 511 current_char) == B_FALSE) { 512 513 ERROR_EXIT(®cmp_lock, 514 arg_listp, compile_startp); 515 516 } else { 517 /* 518 * the DASH is part of a 519 * character range 520 * expression; encode the 521 * rest of the expression 522 */ 523 regexp += char_size; 524 *compilep = (unsigned char) 525 THRU; 526 compilep++; 527 expr_length = add_char( 528 compilep, current_char); 529 compilep += expr_length; 530 /* 531 * if a DASH follows this 532 * character range 533 * expression, 534 * it represents the DASH 535 * character itself 536 */ 537 dash_indicates_range = 538 B_FALSE; 539 } 540 } 541 542 /* GET THE NEXT CHARACTER */ 543 544 char_size = get_wchar(¤t_char, regexp); 545 if (char_size <= 0) { 546 ERROR_EXIT(®cmp_lock, 547 arg_listp, compile_startp); 548 } else { 549 regexp += char_size; 550 } 551 552 } 553 /* end while (current_char != RIGHT_SQUARE_BRACKET) */ 554 555 /* INSERT THE LENGTH OF THE CLASS INTO THE */ 556 /* COMPILED EXPRESSION */ 557 558 class_length = (unsigned int) 559 (compilep - regex_typep - 1); 560 if ((class_length < 2) || 561 (class_length > MAX_SINGLE_BYTE_INT)) { 562 ERROR_EXIT(®cmp_lock, arg_listp, 563 compile_startp); 564 } else { 565 *(regex_typep + 1) = (unsigned char) 566 class_length; 567 } 568 break; /* end case LEFT_SQUARE_BRACKET */ 569 570 case LEFT_PAREN: 571 572 /* 573 * start of a parenthesized group of regular 574 * expressions compiles to <'\0'><'\0'>, leaving 575 * space in the compiled regular expression for 576 * <group_type|ADDED_LENGTH_BITS><group_length> 577 */ 578 579 if (push_compilep(compilep) == (char *)0) { 580 /* 581 * groups can contain groups, so group 582 * start pointers 583 * must be saved and restored in sequence 584 */ 585 ERROR_EXIT(®cmp_lock, arg_listp, 586 compile_startp); 587 } else { 588 can_repeat = B_FALSE; 589 *compilep = '\0'; /* for debugging */ 590 compilep++; 591 *compilep = '\0'; /* for debugging */ 592 compilep++; 593 } 594 break; /* end case LEFT_PAREN */ 595 596 case RIGHT_PAREN: 597 /* end of a marked group of regular expressions */ 598 599 /* 600 * (<regex>)$0-9 compiles to 601 * <SAVED_GROUP><substringn><compiled_regex...>\ 602 * <END_SAVED_GROUP><substringn><return_arg_number> 603 * (<regex>)* compiles to 604 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS> 605 * <group_length> <compiled_regex...> 606 * <END_GROUP|ZERO_OR_MORE><groupn> 607 * (<regex>)+ compiles to 608 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS> 609 * <group_length>\ 610 * <compiled_regex...><END_GROUP|ONE_OR_MORE> 611 * <groupn> 612 * (<regex>){...} compiles to 613 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\ 614 * <compiled_regex...><END_GROUP|COUNT><groupn>\ 615 * <minimum_repeat_count><maximum_repeat_count> 616 * otherwise (<regex>) compiles to 617 * <SIMPLE_GROUP><blank><compiled_regex...> 618 * <END_GROUP><groupn> 619 * 620 * NOTE: 621 * 622 * group_length + (256 * ADDED_LENGTH_BITS) == 623 * length_of(<compiled_regex...><END_GROUP|...> 624 * <groupn>) 625 * which also == 626 * length_of(<group_type|ADDED_LENGTH_BITS> 627 * <group_length>\ <compiled_regex...>) 628 * groupn no longer seems to be used, but the code 629 * still computes it to preserve backward 630 * compatibility 631 * with earlier versions of regex(). 632 */ 633 634 /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */ 635 636 regex_typep = pop_compilep(); 637 if (regex_typep == (char *)0) { 638 ERROR_EXIT(®cmp_lock, arg_listp, 639 compile_startp); 640 } 641 char_size = get_wchar(¤t_char, regexp); 642 if (char_size < 0) { 643 ERROR_EXIT(®cmp_lock, arg_listp, 644 compile_startp); 645 } else if (char_size == 0) { 646 *regex_typep = SIMPLE_GROUP; 647 can_repeat = B_TRUE; 648 *compilep = (unsigned char)END_GROUP; 649 regex_typep = compilep; 650 compilep++; 651 *compilep = (unsigned char)groupn; 652 groupn++; 653 compilep++; 654 } else if (current_char == DOLLAR_SIGN) { 655 *regex_typep = SAVED_GROUP; 656 regex_typep++; 657 *regex_typep = (char)substringn; 658 can_repeat = B_FALSE; 659 regexp ++; 660 return_arg_number = get_digit(regexp); 661 if ((return_arg_number < 0) || 662 (substringn >= NSUBSTRINGS)) { 663 ERROR_EXIT(®cmp_lock, arg_listp, 664 compile_startp); 665 } 666 regexp++; 667 *compilep = (unsigned char)END_SAVED_GROUP; 668 compilep++; 669 *compilep = (unsigned char)substringn; 670 substringn++; 671 compilep++; 672 *compilep = (unsigned char)return_arg_number; 673 compilep++; 674 } else { 675 switch (current_char) { 676 case STAR: 677 *regex_typep = ZERO_OR_MORE_GROUP; 678 break; 679 case PLUS: 680 *regex_typep = ONE_OR_MORE_GROUP; 681 break; 682 case LEFT_CURLY_BRACE: 683 *regex_typep = COUNTED_GROUP; 684 break; 685 default: 686 *regex_typep = SIMPLE_GROUP; 687 } 688 if (*regex_typep != SIMPLE_GROUP) { 689 group_length = (unsigned int) 690 (compilep - regex_typep); 691 if (group_length >= 1024) { 692 ERROR_EXIT(®cmp_lock, 693 arg_listp, compile_startp); 694 } 695 high_bits = group_length >> 696 TIMES_256_SHIFT; 697 low_bits = group_length & 698 SINGLE_BYTE_MASK; 699 *regex_typep = 700 (unsigned char) 701 ((unsigned int) 702 *regex_typep | high_bits); 703 regex_typep++; 704 *regex_typep = 705 (unsigned char)low_bits; 706 } 707 can_repeat = B_TRUE; 708 *compilep = (unsigned char)END_GROUP; 709 regex_typep = compilep; 710 compilep++; 711 *compilep = (unsigned char)groupn; 712 groupn++; 713 compilep++; 714 } 715 716 break; /* end case RIGHT_PAREN */ 717 718 case STAR: /* zero or more repetitions of the */ 719 /* preceding expression */ 720 721 /* 722 * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\ 723 * <compiled_regex...> 724 * (<regex...>)* compiles to 725 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\ 726 * <group_length><compiled_regex...>\ 727 * <END_GROUP|ZERO_OR_MORE><groupn> 728 */ 729 730 if (can_repeat == B_FALSE) { 731 ERROR_EXIT(®cmp_lock, arg_listp, 732 compile_startp); 733 } else { 734 can_repeat = B_FALSE; 735 *regex_typep = (unsigned char) 736 ((unsigned int)*regex_typep | ZERO_OR_MORE); 737 } 738 break; /* end case '*' */ 739 740 case PLUS: 741 /* one or more repetitions of the preceding */ 742 /* expression */ 743 744 /* 745 * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\ 746 * <compiled_regex...> (<regex...>)+ compiles to 747 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\ 748 * <group_length><compiled_regex...>\ 749 * <END_GROUP|ONE_OR_MORE><groupn> 750 */ 751 752 if (can_repeat == B_FALSE) { 753 ERROR_EXIT(®cmp_lock, arg_listp, 754 compile_startp); 755 } else { 756 can_repeat = B_FALSE; 757 *regex_typep = 758 (unsigned char)((unsigned int)* 759 regex_typep | ONE_OR_MORE); 760 } 761 break; /* end case '+' */ 762 763 case LEFT_CURLY_BRACE: 764 765 /* 766 * repeat the preceding regular expression 767 * at least min_count times 768 * and at most max_count times 769 * 770 * <regex...>{min_count} compiles to 771 * <regex type|COUNT><compiled_regex...> 772 * <min_count><min_count> 773 * 774 * <regex...>{min_count,} compiles to 775 * <regex type|COUNT><compiled_regex...> 776 * <min_count><UNLIMITED> 777 * 778 * <regex...>{min_count,max_count} compiles to 779 * <regex type>|COUNT><compiled_regex...> 780 * <min_count><max_count> 781 * 782 * (<regex...>){min_count,max_count} compiles to 783 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\ 784 * <compiled_regex...><END_GROUP|COUNT><groupn>\ 785 * <minimum_match_count><maximum_match_count> 786 */ 787 788 if (can_repeat == B_FALSE) { 789 ERROR_EXIT(®cmp_lock, arg_listp, 790 compile_startp); 791 } 792 can_repeat = B_FALSE; 793 *regex_typep = (unsigned char)((unsigned int)* 794 regex_typep | COUNT); 795 count_length = get_count(&min_count, regexp); 796 if (count_length <= 0) { 797 ERROR_EXIT(®cmp_lock, arg_listp, 798 compile_startp); 799 } 800 regexp += count_length; 801 802 if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */ 803 regexp++; 804 max_count = min_count; 805 } else if (*regexp == COMMA) { /* {min_count,..} */ 806 regexp++; 807 /* {min_count,} */ 808 if (*regexp == RIGHT_CURLY_BRACE) { 809 regexp++; 810 max_count = UNLIMITED; 811 } else { /* {min_count,max_count} */ 812 count_length = get_count( 813 &max_count, regexp); 814 if (count_length <= 0) { 815 ERROR_EXIT(®cmp_lock, 816 arg_listp, compile_startp); 817 } 818 regexp += count_length; 819 if (*regexp != RIGHT_CURLY_BRACE) { 820 ERROR_EXIT(®cmp_lock, 821 arg_listp, compile_startp); 822 } 823 regexp++; 824 } 825 } else { /* invalid expression */ 826 ERROR_EXIT(®cmp_lock, arg_listp, 827 compile_startp); 828 } 829 830 if ((min_count > MAX_SINGLE_BYTE_INT) || 831 ((max_count != UNLIMITED) && 832 (min_count > max_count))) { 833 ERROR_EXIT(®cmp_lock, arg_listp, 834 compile_startp); 835 } else { 836 *compilep = (unsigned char)min_count; 837 compilep++; 838 *compilep = (unsigned char)max_count; 839 compilep++; 840 } 841 break; /* end case LEFT_CURLY_BRACE */ 842 843 default: /* a single non-special character */ 844 845 /* 846 * compiles to <ASCII_CHAR><ascii_char> or 847 * <MULTIBYTE_CHAR><multibyte_char> 848 */ 849 850 can_repeat = B_TRUE; 851 regex_typep = compilep; 852 expr_length = add_single_char_expr(compilep, 853 current_char); 854 compilep += expr_length; 855 856 } /* end switch (current_char) */ 857 858 /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */ 859 860 char_size = get_wchar(¤t_char, regexp); 861 if (char_size < 0) { 862 ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); 863 } else if (char_size > 0) { 864 regexp += char_size; 865 } else if /* (char_size == 0) && */ (next_argp != (char *)0) { 866 regexp = next_argp; 867 next_argp = va_arg(arg_listp, /* const */ char *); 868 char_size = get_wchar(¤t_char, regexp); 869 if (char_size <= 0) { 870 ERROR_EXIT(®cmp_lock, arg_listp, 871 compile_startp); 872 } else { 873 regexp += char_size; 874 } 875 } else /* ((char_size == 0) && (next_argp == (char *)0)) */ { 876 if (pop_compilep() != (char *)0) { 877 /* unmatched parentheses */ 878 ERROR_EXIT(®cmp_lock, arg_listp, 879 compile_startp); 880 } 881 *compilep = (unsigned char)END_REGEX; 882 compilep++; 883 *compilep = '\0'; 884 compilep++; 885 __i_size = (int)(compilep - compile_startp); 886 va_end(arg_listp); 887 lmutex_unlock(®cmp_lock); 888 return (compile_startp); 889 } 890 } /* end for (;;) */ 891 892 } /* regcmp() */ 893 894 895 /* DEFINITIONS OF PRIVATE FUNCTIONS */ 896 897 static int 898 add_char(char *compilep, wchar_t wchar) 899 { 900 int expr_length; 901 902 if ((unsigned int)wchar <= (unsigned int)0x7f) { 903 *compilep = (unsigned char)wchar; 904 expr_length = 1; 905 } else { 906 expr_length = wctomb(compilep, wchar); 907 } 908 return (expr_length); 909 } 910 911 static int 912 add_single_char_expr(char *compilep, wchar_t wchar) 913 { 914 int expr_length = 0; 915 916 if ((unsigned int)wchar <= (unsigned int)0x7f) { 917 *compilep = (unsigned char)ASCII_CHAR; 918 compilep++; 919 *compilep = (unsigned char)wchar; 920 expr_length += 2; 921 } else { 922 *compilep = (unsigned char)MULTIBYTE_CHAR; 923 compilep++; 924 expr_length++; 925 expr_length += wctomb(compilep, wchar); 926 } 927 return (expr_length); 928 } 929 930 static int 931 get_count(int *countp, const char *regexp) 932 { 933 char count_char = '0'; 934 int count = 0; 935 int count_length = 0; 936 937 if (regexp == (char *)0) { 938 return ((int)0); 939 } else { 940 count_char = *regexp; 941 while (('0' <= count_char) && (count_char <= '9')) { 942 count = (10 * count) + (int)(count_char - '0'); 943 count_length++; 944 regexp++; 945 count_char = *regexp; 946 } 947 } 948 *countp = count; 949 return (count_length); 950 } 951 952 static int 953 get_digit(const char *regexp) 954 { 955 char digit; 956 957 if (regexp == (char *)0) { 958 return ((int)-1); 959 } else { 960 digit = *regexp; 961 if (('0' <= digit) && (digit <= '9')) { 962 return ((int)(digit - '0')); 963 } else { 964 return ((int)-1); 965 } 966 } 967 } 968 969 static int 970 get_wchar(wchar_t *wcharp, const char *regexp) 971 { 972 int char_size; 973 974 if (regexp == (char *)0) { 975 char_size = 0; 976 *wcharp = (wchar_t)((unsigned int)'\0'); 977 } else if (*regexp == '\0') { 978 char_size = 0; 979 *wcharp = (wchar_t)((unsigned int)*regexp); 980 } else if ((unsigned char)*regexp <= (unsigned char)0x7f) { 981 char_size = 1; 982 *wcharp = (wchar_t)((unsigned int)*regexp); 983 } else { 984 char_size = mbtowc(wcharp, regexp, MB_LEN_MAX); 985 } 986 return (char_size); 987 } 988 989 static char * 990 pop_compilep(void) 991 { 992 char *compilep; 993 994 if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) { 995 return ((char *)0); 996 } else { 997 compilep = *compilep_stackp; 998 compilep_stackp++; 999 return (compilep); 1000 } 1001 } 1002 1003 static char * 1004 push_compilep(char *compilep) 1005 { 1006 if (compilep_stackp <= &compilep_stack[0]) { 1007 return ((char *)0); 1008 } else { 1009 compilep_stackp--; 1010 *compilep_stackp = compilep; 1011 return (compilep); 1012 } 1013 } 1014 1015 static boolean_t 1016 valid_range(wchar_t lower_char, wchar_t upper_char) 1017 { 1018 return (((lower_char <= 0x7f) && (upper_char <= 0x7f) && 1019 !iswcntrl(lower_char) && !iswcntrl(upper_char) && 1020 (lower_char < upper_char)) || 1021 (((lower_char & WCHAR_CSMASK) == 1022 (upper_char & WCHAR_CSMASK)) && 1023 (lower_char < upper_char))); 1024 } 1025