/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ #pragma ident "%Z%%M% %I% %E% SMI" /* * IMPORTANT NOTE: * * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS. * IT IS **NOT** CHARACTER SET INDEPENDENT. * */ #pragma weak regcmp = _regcmp #include "synonyms.h" #include "mtlib.h" #include #include #include #include #include #include #include #include "tsd.h" /* CONSTANTS SHARED WITH regex() */ #include "regex.h" /* PRIVATE CONSTANTS */ #define BACKSLASH '\\' #define CIRCUMFLEX '^' #define COMMA ',' #define DASH '-' #define DOLLAR_SIGN '$' #define DOT '.' #define LEFT_CURLY_BRACE '{' #define LEFT_PAREN '(' #define LEFT_SQUARE_BRACKET '[' #define PLUS '+' #define RIGHT_CURLY_BRACE '}' #define RIGHT_PAREN ')' #define RIGHT_SQUARE_BRACKET ']' #define SINGLE_BYTE_MASK 0xff #define STRINGP_STACK_SIZE 50 #define STAR '*' /* PRIVATE GLOBAL VARIABLES */ static char *compilep_stack[STRINGP_STACK_SIZE]; static char **compilep_stackp; static mutex_t regcmp_lock = DEFAULTMUTEX; /* DECLARATIONS OF PRIVATE FUNCTIONS */ static int add_char(char *compilep, wchar_t wchar); static int add_single_char_expr(char *compilep, wchar_t wchar); #define ERROR_EXIT(mutex_lockp, arg_listp, compile_startp) \ \ va_end(arg_listp); \ lmutex_unlock(mutex_lockp); \ if ((compile_startp) != (char *)0) \ free((void *)compile_startp); \ return ((char *)0) static int get_count(int *countp, const char *regexp); static int get_digit(const char *regexp); static int get_wchar(wchar_t *wchar, const char *regexp); static char *pop_compilep(void); static char *push_compilep(char *compilep); static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char); /* DEFINITIONS OF PUBLIC VARIABLES */ int __i_size; /* * define thread-specific storage for __i_size * */ int * ___i_size(void) { if (_thr_main()) return (&__i_size); return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL)); } #define __i_size (*(___i_size())) /* DEFINITION OF regcmp() */ extern char * regcmp(const char *regexp, ...) { va_list arg_listp; size_t arg_strlen; boolean_t can_repeat; int char_size; unsigned int class_length; char *compilep; char *compile_startp = (char *)0; int count_length; wchar_t current_char; int expr_length; int groupn; unsigned int group_length; unsigned int high_bits; boolean_t dash_indicates_range; unsigned int low_bits; int max_count; int min_count; const char *next_argp; wchar_t first_char_in_range; char *regex_typep; int return_arg_number; int substringn; if (___i_size() == (int *)0) return ((char *)0); /* * When compiling a regular expression, regcmp() generates at most * two extra single-byte characters for each character in the * expression, so allocating three times the number of bytes in all * the strings that comprise the regular expression will ensure that * regcmp() won't overwrite the end of the allocated block when * compiling the expression. */ va_start(arg_listp, regexp); next_argp = regexp; arg_strlen = 0; while (next_argp != (char *)0) { arg_strlen += strlen(next_argp); next_argp = va_arg(arg_listp, /* const */ char *); } va_end(arg_listp); if (arg_strlen == 0) return ((char *)0); compile_startp = (char *)malloc(3 * arg_strlen); if (compile_startp == (char *)0) return ((char *)0); lmutex_lock(®cmp_lock); __i_size = 0; compilep = compile_startp; compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE]; /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */ va_start(arg_listp, regexp); next_argp = va_arg(arg_listp, /* const */ char *); char_size = get_wchar(¤t_char, regexp); if (char_size < 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else if (char_size > 0) { regexp += char_size; } else /* (char_size == 0 ) */ { regexp = next_argp; next_argp = va_arg(arg_listp, /* const */ char *); char_size = get_wchar(¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { regexp += char_size; } } /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */ if (current_char == CIRCUMFLEX) { char_size = get_wchar(¤t_char, regexp); if (char_size < 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else if (char_size > 0) { regexp += char_size; *compilep = (unsigned char)START_OF_STRING_MARK; compilep++; } else if /* (char_size == 0) && */ (next_argp != (char *)0) { regexp = next_argp; next_argp = va_arg(arg_listp, /* const */ char *); char_size = get_wchar(¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { regexp += char_size; } *compilep = (unsigned char)START_OF_STRING_MARK; compilep++; } else { /* ((char_size==0) && (next_argp==(char *)0)) */ /* * the regular expression is "^" */ *compilep = (unsigned char)START_OF_STRING_MARK; compilep++; *compilep = (unsigned char)END_REGEX; compilep++; *compilep = '\0'; compilep++; __i_size = (int)(compilep - compile_startp); va_end(arg_listp); lmutex_unlock(®cmp_lock); return (compile_startp); } } /* COMPILE THE REGULAR EXPRESSION */ groupn = 0; substringn = 0; can_repeat = B_FALSE; for (;;) { /* * At the end of each iteration get the next character * from the regular expression and increment regexp to * point to the following character. Exit when all * the characters in all the strings in the argument * list have been read. */ switch (current_char) { /* * No fall-through. Each case ends with either * a break or an error exit. Each case starts * with compilep addressing the next location to * be written in the compiled regular expression, * and with regexp addressing the next character * to be read from the regular expression being * compiled. Each case that doesn't return * increments regexp to address the next character * to be read from the regular expression and * increments compilep to address the next * location to be written in the compiled * regular expression. * * NOTE: The comments for each case give the meaning * of the regular expression compiled by the case * and the character string written to the compiled * regular expression by the case. Each single * character * written to the compiled regular expression is * shown enclosed in angle brackets (<>). Each * compiled regular expression begins with a marker * character which is shown as a named constant * (e.g. ). Character constants are * shown enclosed in single quotes (e.g. <'$'>). * All other single characters written to the * compiled regular expression are shown as lower * case variable names (e.g. or * ). Multicharacter * strings written to the compiled regular expression * are shown as variable names followed by elipses * (e.g. ). */ case DOLLAR_SIGN: /* end of string marker or simple dollar sign */ /* compiles to or */ /* <'$'> */ char_size = get_wchar(¤t_char, regexp); if ((char_size == 0) && (next_argp == (char *)0)) { can_repeat = B_FALSE; *compilep = (unsigned char)END_OF_STRING_MARK; compilep++; } else { can_repeat = B_TRUE; *compilep = (unsigned char)ASCII_CHAR; regex_typep = compilep; compilep++; *compilep = DOLLAR_SIGN; compilep++; } break; /* end case DOLLAR_SIGN */ case DOT: /* any character */ /* compiles to */ can_repeat = B_TRUE; *compilep = (unsigned char)ANY_CHAR; regex_typep = compilep; compilep++; break; /* end case DOT */ case BACKSLASH: /* escaped character */ /* * compiles to or * */ char_size = get_wchar(¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { regexp += char_size; can_repeat = B_TRUE; expr_length = add_single_char_expr( compilep, current_char); regex_typep = compilep; compilep += expr_length; } break; /* end case '\\' */ case LEFT_SQUARE_BRACKET: /* start of a character class expression */ /* * [^...c...] compiles to * <...c...> * [^...a-z...] compiles to * <...az...> * [...c...] compiles to * <...c...> * [...a-z...] compiles to * <...az...> * * NOTE: includes the * byte */ can_repeat = B_TRUE; regex_typep = compilep; /* DETERMINE THE CLASS TYPE */ /* * NOTE: This algorithm checks the value of the * "multibyte" * macro in (included in ) * to find out if regcmp() * is compiling the regular expression in a * multibyte locale. */ char_size = get_wchar(¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else if (current_char == CIRCUMFLEX) { regexp++; char_size = get_wchar(¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { regexp += char_size; if (!multibyte) { *compilep = (unsigned char) NOT_IN_ASCII_CHAR_CLASS; } else { *compilep = (unsigned char) NOT_IN_MULTIBYTE_CHAR_CLASS; } /* leave space for */ compilep += 2; } } else { regexp += char_size; if (!multibyte) { *compilep = (unsigned char) IN_ASCII_CHAR_CLASS; } else { *compilep = (unsigned char) IN_MULTIBYTE_CHAR_CLASS; } /* leave space for */ compilep += 2; } /* COMPILE THE CLASS */ /* * check for a leading right square bracket, * which is allowed */ if (current_char == RIGHT_SQUARE_BRACKET) { /* * the leading RIGHT_SQUARE_BRACKET may * be part of a character range * expression like "[]-\]" */ dash_indicates_range = B_TRUE; first_char_in_range = current_char; char_size = get_wchar(¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { regexp += char_size; *compilep = RIGHT_SQUARE_BRACKET; compilep++; } } else { /* * decode the character in the following * while loop and decide then if it can * be the first character * in a character range expression */ dash_indicates_range = B_FALSE; } while (current_char != RIGHT_SQUARE_BRACKET) { if (current_char != DASH) { /* * if a DASH follows current_char, * current_char, the DASH and the * character that follows the DASH * may form a character range * expression */ dash_indicates_range = B_TRUE; first_char_in_range = current_char; expr_length = add_char( compilep, current_char); compilep += expr_length; } else if /* (current_char == DASH) && */ (dash_indicates_range == B_FALSE) { /* * current_char is a DASH, but * either begins the entire * character class or follows a * character that's already * part of a character range * expression, so it simply * represents the DASH character * itself */ *compilep = DASH; compilep ++; /* * if another DASH follows this * one, this DASH is part * of a character range expression * like "[--\]" */ dash_indicates_range = B_TRUE; first_char_in_range = current_char; } else /* ((current_char == DASH && */ /* (dash_indicates_range == B_TRUE)) */ { /* * the DASH appears after a single * character that isn't * already part of a character * range expression, so it * and the characters preceding * and following it can form a * character range expression * like "[a-z]" */ char_size = get_wchar( ¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else if (current_char == RIGHT_SQUARE_BRACKET) { /* * the preceding DASH is * the last character in the * class and represents the * DASH character itself */ *compilep = DASH; compilep++; } else if (valid_range( first_char_in_range, current_char) == B_FALSE) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { /* * the DASH is part of a * character range * expression; encode the * rest of the expression */ regexp += char_size; *compilep = (unsigned char) THRU; compilep++; expr_length = add_char( compilep, current_char); compilep += expr_length; /* * if a DASH follows this * character range * expression, * it represents the DASH * character itself */ dash_indicates_range = B_FALSE; } } /* GET THE NEXT CHARACTER */ char_size = get_wchar(¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { regexp += char_size; } } /* end while (current_char != RIGHT_SQUARE_BRACKET) */ /* INSERT THE LENGTH OF THE CLASS INTO THE */ /* COMPILED EXPRESSION */ class_length = (unsigned int) (compilep - regex_typep - 1); if ((class_length < 2) || (class_length > MAX_SINGLE_BYTE_INT)) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { *(regex_typep + 1) = (unsigned char) class_length; } break; /* end case LEFT_SQUARE_BRACKET */ case LEFT_PAREN: /* * start of a parenthesized group of regular * expressions compiles to <'\0'><'\0'>, leaving * space in the compiled regular expression for * */ if (push_compilep(compilep) == (char *)0) { /* * groups can contain groups, so group * start pointers * must be saved and restored in sequence */ ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { can_repeat = B_FALSE; *compilep = '\0'; /* for debugging */ compilep++; *compilep = '\0'; /* for debugging */ compilep++; } break; /* end case LEFT_PAREN */ case RIGHT_PAREN: /* end of a marked group of regular expressions */ /* * ()$0-9 compiles to * \ * * ()* compiles to * * * * ()+ compiles to * * \ * * * (){...} compiles to * \ * \ * * otherwise () compiles to * * * * NOTE: * * group_length + (256 * ADDED_LENGTH_BITS) == * length_of( * ) * which also == * length_of( * \ ) * groupn no longer seems to be used, but the code * still computes it to preserve backward * compatibility * with earlier versions of regex(). */ /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */ regex_typep = pop_compilep(); if (regex_typep == (char *)0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } char_size = get_wchar(¤t_char, regexp); if (char_size < 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else if (char_size == 0) { *regex_typep = SIMPLE_GROUP; can_repeat = B_TRUE; *compilep = (unsigned char)END_GROUP; regex_typep = compilep; compilep++; *compilep = (unsigned char)groupn; groupn++; compilep++; } else if (current_char == DOLLAR_SIGN) { *regex_typep = SAVED_GROUP; regex_typep++; *regex_typep = (char)substringn; can_repeat = B_FALSE; regexp ++; return_arg_number = get_digit(regexp); if ((return_arg_number < 0) || (substringn >= NSUBSTRINGS)) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } regexp++; *compilep = (unsigned char)END_SAVED_GROUP; compilep++; *compilep = (unsigned char)substringn; substringn++; compilep++; *compilep = (unsigned char)return_arg_number; compilep++; } else { switch (current_char) { case STAR: *regex_typep = ZERO_OR_MORE_GROUP; break; case PLUS: *regex_typep = ONE_OR_MORE_GROUP; break; case LEFT_CURLY_BRACE: *regex_typep = COUNTED_GROUP; break; default: *regex_typep = SIMPLE_GROUP; } if (*regex_typep != SIMPLE_GROUP) { group_length = (unsigned int) (compilep - regex_typep); if (group_length >= 1024) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } high_bits = group_length >> TIMES_256_SHIFT; low_bits = group_length & SINGLE_BYTE_MASK; *regex_typep = (unsigned char) ((unsigned int) *regex_typep | high_bits); regex_typep++; *regex_typep = (unsigned char)low_bits; } can_repeat = B_TRUE; *compilep = (unsigned char)END_GROUP; regex_typep = compilep; compilep++; *compilep = (unsigned char)groupn; groupn++; compilep++; } break; /* end case RIGHT_PAREN */ case STAR: /* zero or more repetitions of the */ /* preceding expression */ /* * * compiles to \ * * ()* compiles to * \ * \ * */ if (can_repeat == B_FALSE) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { can_repeat = B_FALSE; *regex_typep = (unsigned char) ((unsigned int)*regex_typep | ZERO_OR_MORE); } break; /* end case '*' */ case PLUS: /* one or more repetitions of the preceding */ /* expression */ /* * + compiles to \ * ()+ compiles to * \ * \ * */ if (can_repeat == B_FALSE) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { can_repeat = B_FALSE; *regex_typep = (unsigned char)((unsigned int)* regex_typep | ONE_OR_MORE); } break; /* end case '+' */ case LEFT_CURLY_BRACE: /* * repeat the preceding regular expression * at least min_count times * and at most max_count times * * {min_count} compiles to * * * * {min_count,} compiles to * * * * {min_count,max_count} compiles to * |COUNT> * * * (){min_count,max_count} compiles to * \ * \ * */ if (can_repeat == B_FALSE) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } can_repeat = B_FALSE; *regex_typep = (unsigned char)((unsigned int)* regex_typep | COUNT); count_length = get_count(&min_count, regexp); if (count_length <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } regexp += count_length; if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */ regexp++; max_count = min_count; } else if (*regexp == COMMA) { /* {min_count,..} */ regexp++; /* {min_count,} */ if (*regexp == RIGHT_CURLY_BRACE) { regexp++; max_count = UNLIMITED; } else { /* {min_count,max_count} */ count_length = get_count( &max_count, regexp); if (count_length <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } regexp += count_length; if (*regexp != RIGHT_CURLY_BRACE) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } regexp++; } } else { /* invalid expression */ ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } if ((min_count > MAX_SINGLE_BYTE_INT) || ((max_count != UNLIMITED) && (min_count > max_count))) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { *compilep = (unsigned char)min_count; compilep++; *compilep = (unsigned char)max_count; compilep++; } break; /* end case LEFT_CURLY_BRACE */ default: /* a single non-special character */ /* * compiles to or * */ can_repeat = B_TRUE; regex_typep = compilep; expr_length = add_single_char_expr(compilep, current_char); compilep += expr_length; } /* end switch (current_char) */ /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */ char_size = get_wchar(¤t_char, regexp); if (char_size < 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else if (char_size > 0) { regexp += char_size; } else if /* (char_size == 0) && */ (next_argp != (char *)0) { regexp = next_argp; next_argp = va_arg(arg_listp, /* const */ char *); char_size = get_wchar(¤t_char, regexp); if (char_size <= 0) { ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } else { regexp += char_size; } } else /* ((char_size == 0) && (next_argp == (char *)0)) */ { if (pop_compilep() != (char *)0) { /* unmatched parentheses */ ERROR_EXIT(®cmp_lock, arg_listp, compile_startp); } *compilep = (unsigned char)END_REGEX; compilep++; *compilep = '\0'; compilep++; __i_size = (int)(compilep - compile_startp); va_end(arg_listp); lmutex_unlock(®cmp_lock); return (compile_startp); } } /* end for (;;) */ } /* regcmp() */ /* DEFINITIONS OF PRIVATE FUNCTIONS */ static int add_char(char *compilep, wchar_t wchar) { int expr_length; if ((unsigned int)wchar <= (unsigned int)0x7f) { *compilep = (unsigned char)wchar; expr_length = 1; } else { expr_length = wctomb(compilep, wchar); } return (expr_length); } static int add_single_char_expr(char *compilep, wchar_t wchar) { int expr_length = 0; if ((unsigned int)wchar <= (unsigned int)0x7f) { *compilep = (unsigned char)ASCII_CHAR; compilep++; *compilep = (unsigned char)wchar; expr_length += 2; } else { *compilep = (unsigned char)MULTIBYTE_CHAR; compilep++; expr_length++; expr_length += wctomb(compilep, wchar); } return (expr_length); } static int get_count(int *countp, const char *regexp) { char count_char = '0'; int count = 0; int count_length = 0; if (regexp == (char *)0) { return ((int)0); } else { count_char = *regexp; while (('0' <= count_char) && (count_char <= '9')) { count = (10 * count) + (int)(count_char - '0'); count_length++; regexp++; count_char = *regexp; } } *countp = count; return (count_length); } static int get_digit(const char *regexp) { char digit; if (regexp == (char *)0) { return ((int)-1); } else { digit = *regexp; if (('0' <= digit) && (digit <= '9')) { return ((int)(digit - '0')); } else { return ((int)-1); } } } static int get_wchar(wchar_t *wcharp, const char *regexp) { int char_size; if (regexp == (char *)0) { char_size = 0; *wcharp = (wchar_t)((unsigned int)'\0'); } else if (*regexp == '\0') { char_size = 0; *wcharp = (wchar_t)((unsigned int)*regexp); } else if ((unsigned char)*regexp <= (unsigned char)0x7f) { char_size = 1; *wcharp = (wchar_t)((unsigned int)*regexp); } else { char_size = mbtowc(wcharp, regexp, MB_LEN_MAX); } return (char_size); } static char * pop_compilep(void) { char *compilep; if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) { return ((char *)0); } else { compilep = *compilep_stackp; compilep_stackp++; return (compilep); } } static char * push_compilep(char *compilep) { if (compilep_stackp <= &compilep_stack[0]) { return ((char *)0); } else { compilep_stackp--; *compilep_stackp = compilep; return (compilep); } } static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char) { return (((lower_char <= 0x7f) && (upper_char <= 0x7f) && !iswcntrl(lower_char) && !iswcntrl(upper_char) && (lower_char < upper_char)) || (((lower_char & WCHAR_CSMASK) == (upper_char & WCHAR_CSMASK)) && (lower_char < upper_char))); }