1*0b57cec5SDimitry Andric //===-------------------------- regex.cpp ---------------------------------===// 2*0b57cec5SDimitry Andric // 3*0b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*0b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5*0b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*0b57cec5SDimitry Andric // 7*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 8*0b57cec5SDimitry Andric 9*0b57cec5SDimitry Andric #include "regex" 10*0b57cec5SDimitry Andric #include "algorithm" 11*0b57cec5SDimitry Andric #include "iterator" 12*0b57cec5SDimitry Andric 13*0b57cec5SDimitry Andric _LIBCPP_BEGIN_NAMESPACE_STD 14*0b57cec5SDimitry Andric 15*0b57cec5SDimitry Andric static 16*0b57cec5SDimitry Andric const char* 17*0b57cec5SDimitry Andric make_error_type_string(regex_constants::error_type ecode) 18*0b57cec5SDimitry Andric { 19*0b57cec5SDimitry Andric switch (ecode) 20*0b57cec5SDimitry Andric { 21*0b57cec5SDimitry Andric case regex_constants::error_collate: 22*0b57cec5SDimitry Andric return "The expression contained an invalid collating element name."; 23*0b57cec5SDimitry Andric case regex_constants::error_ctype: 24*0b57cec5SDimitry Andric return "The expression contained an invalid character class name."; 25*0b57cec5SDimitry Andric case regex_constants::error_escape: 26*0b57cec5SDimitry Andric return "The expression contained an invalid escaped character, or a " 27*0b57cec5SDimitry Andric "trailing escape."; 28*0b57cec5SDimitry Andric case regex_constants::error_backref: 29*0b57cec5SDimitry Andric return "The expression contained an invalid back reference."; 30*0b57cec5SDimitry Andric case regex_constants::error_brack: 31*0b57cec5SDimitry Andric return "The expression contained mismatched [ and ]."; 32*0b57cec5SDimitry Andric case regex_constants::error_paren: 33*0b57cec5SDimitry Andric return "The expression contained mismatched ( and )."; 34*0b57cec5SDimitry Andric case regex_constants::error_brace: 35*0b57cec5SDimitry Andric return "The expression contained mismatched { and }."; 36*0b57cec5SDimitry Andric case regex_constants::error_badbrace: 37*0b57cec5SDimitry Andric return "The expression contained an invalid range in a {} expression."; 38*0b57cec5SDimitry Andric case regex_constants::error_range: 39*0b57cec5SDimitry Andric return "The expression contained an invalid character range, " 40*0b57cec5SDimitry Andric "such as [b-a] in most encodings."; 41*0b57cec5SDimitry Andric case regex_constants::error_space: 42*0b57cec5SDimitry Andric return "There was insufficient memory to convert the expression into " 43*0b57cec5SDimitry Andric "a finite state machine."; 44*0b57cec5SDimitry Andric case regex_constants::error_badrepeat: 45*0b57cec5SDimitry Andric return "One of *?+{ was not preceded by a valid regular expression."; 46*0b57cec5SDimitry Andric case regex_constants::error_complexity: 47*0b57cec5SDimitry Andric return "The complexity of an attempted match against a regular " 48*0b57cec5SDimitry Andric "expression exceeded a pre-set level."; 49*0b57cec5SDimitry Andric case regex_constants::error_stack: 50*0b57cec5SDimitry Andric return "There was insufficient memory to determine whether the regular " 51*0b57cec5SDimitry Andric "expression could match the specified character sequence."; 52*0b57cec5SDimitry Andric case regex_constants::__re_err_grammar: 53*0b57cec5SDimitry Andric return "An invalid regex grammar has been requested."; 54*0b57cec5SDimitry Andric case regex_constants::__re_err_empty: 55*0b57cec5SDimitry Andric return "An empty regex is not allowed in the POSIX grammar."; 56*0b57cec5SDimitry Andric default: 57*0b57cec5SDimitry Andric break; 58*0b57cec5SDimitry Andric } 59*0b57cec5SDimitry Andric return "Unknown error type"; 60*0b57cec5SDimitry Andric } 61*0b57cec5SDimitry Andric 62*0b57cec5SDimitry Andric regex_error::regex_error(regex_constants::error_type ecode) 63*0b57cec5SDimitry Andric : runtime_error(make_error_type_string(ecode)), 64*0b57cec5SDimitry Andric __code_(ecode) 65*0b57cec5SDimitry Andric {} 66*0b57cec5SDimitry Andric 67*0b57cec5SDimitry Andric regex_error::~regex_error() throw() {} 68*0b57cec5SDimitry Andric 69*0b57cec5SDimitry Andric namespace { 70*0b57cec5SDimitry Andric 71*0b57cec5SDimitry Andric struct collationnames 72*0b57cec5SDimitry Andric { 73*0b57cec5SDimitry Andric const char* elem_; 74*0b57cec5SDimitry Andric char char_; 75*0b57cec5SDimitry Andric }; 76*0b57cec5SDimitry Andric 77*0b57cec5SDimitry Andric const collationnames collatenames[] = 78*0b57cec5SDimitry Andric { 79*0b57cec5SDimitry Andric {"A", 0x41}, 80*0b57cec5SDimitry Andric {"B", 0x42}, 81*0b57cec5SDimitry Andric {"C", 0x43}, 82*0b57cec5SDimitry Andric {"D", 0x44}, 83*0b57cec5SDimitry Andric {"E", 0x45}, 84*0b57cec5SDimitry Andric {"F", 0x46}, 85*0b57cec5SDimitry Andric {"G", 0x47}, 86*0b57cec5SDimitry Andric {"H", 0x48}, 87*0b57cec5SDimitry Andric {"I", 0x49}, 88*0b57cec5SDimitry Andric {"J", 0x4a}, 89*0b57cec5SDimitry Andric {"K", 0x4b}, 90*0b57cec5SDimitry Andric {"L", 0x4c}, 91*0b57cec5SDimitry Andric {"M", 0x4d}, 92*0b57cec5SDimitry Andric {"N", 0x4e}, 93*0b57cec5SDimitry Andric {"NUL", 0x00}, 94*0b57cec5SDimitry Andric {"O", 0x4f}, 95*0b57cec5SDimitry Andric {"P", 0x50}, 96*0b57cec5SDimitry Andric {"Q", 0x51}, 97*0b57cec5SDimitry Andric {"R", 0x52}, 98*0b57cec5SDimitry Andric {"S", 0x53}, 99*0b57cec5SDimitry Andric {"T", 0x54}, 100*0b57cec5SDimitry Andric {"U", 0x55}, 101*0b57cec5SDimitry Andric {"V", 0x56}, 102*0b57cec5SDimitry Andric {"W", 0x57}, 103*0b57cec5SDimitry Andric {"X", 0x58}, 104*0b57cec5SDimitry Andric {"Y", 0x59}, 105*0b57cec5SDimitry Andric {"Z", 0x5a}, 106*0b57cec5SDimitry Andric {"a", 0x61}, 107*0b57cec5SDimitry Andric {"alert", 0x07}, 108*0b57cec5SDimitry Andric {"ampersand", 0x26}, 109*0b57cec5SDimitry Andric {"apostrophe", 0x27}, 110*0b57cec5SDimitry Andric {"asterisk", 0x2a}, 111*0b57cec5SDimitry Andric {"b", 0x62}, 112*0b57cec5SDimitry Andric {"backslash", 0x5c}, 113*0b57cec5SDimitry Andric {"backspace", 0x08}, 114*0b57cec5SDimitry Andric {"c", 0x63}, 115*0b57cec5SDimitry Andric {"carriage-return", 0x0d}, 116*0b57cec5SDimitry Andric {"circumflex", 0x5e}, 117*0b57cec5SDimitry Andric {"circumflex-accent", 0x5e}, 118*0b57cec5SDimitry Andric {"colon", 0x3a}, 119*0b57cec5SDimitry Andric {"comma", 0x2c}, 120*0b57cec5SDimitry Andric {"commercial-at", 0x40}, 121*0b57cec5SDimitry Andric {"d", 0x64}, 122*0b57cec5SDimitry Andric {"dollar-sign", 0x24}, 123*0b57cec5SDimitry Andric {"e", 0x65}, 124*0b57cec5SDimitry Andric {"eight", 0x38}, 125*0b57cec5SDimitry Andric {"equals-sign", 0x3d}, 126*0b57cec5SDimitry Andric {"exclamation-mark", 0x21}, 127*0b57cec5SDimitry Andric {"f", 0x66}, 128*0b57cec5SDimitry Andric {"five", 0x35}, 129*0b57cec5SDimitry Andric {"form-feed", 0x0c}, 130*0b57cec5SDimitry Andric {"four", 0x34}, 131*0b57cec5SDimitry Andric {"full-stop", 0x2e}, 132*0b57cec5SDimitry Andric {"g", 0x67}, 133*0b57cec5SDimitry Andric {"grave-accent", 0x60}, 134*0b57cec5SDimitry Andric {"greater-than-sign", 0x3e}, 135*0b57cec5SDimitry Andric {"h", 0x68}, 136*0b57cec5SDimitry Andric {"hyphen", 0x2d}, 137*0b57cec5SDimitry Andric {"hyphen-minus", 0x2d}, 138*0b57cec5SDimitry Andric {"i", 0x69}, 139*0b57cec5SDimitry Andric {"j", 0x6a}, 140*0b57cec5SDimitry Andric {"k", 0x6b}, 141*0b57cec5SDimitry Andric {"l", 0x6c}, 142*0b57cec5SDimitry Andric {"left-brace", 0x7b}, 143*0b57cec5SDimitry Andric {"left-curly-bracket", 0x7b}, 144*0b57cec5SDimitry Andric {"left-parenthesis", 0x28}, 145*0b57cec5SDimitry Andric {"left-square-bracket", 0x5b}, 146*0b57cec5SDimitry Andric {"less-than-sign", 0x3c}, 147*0b57cec5SDimitry Andric {"low-line", 0x5f}, 148*0b57cec5SDimitry Andric {"m", 0x6d}, 149*0b57cec5SDimitry Andric {"n", 0x6e}, 150*0b57cec5SDimitry Andric {"newline", 0x0a}, 151*0b57cec5SDimitry Andric {"nine", 0x39}, 152*0b57cec5SDimitry Andric {"number-sign", 0x23}, 153*0b57cec5SDimitry Andric {"o", 0x6f}, 154*0b57cec5SDimitry Andric {"one", 0x31}, 155*0b57cec5SDimitry Andric {"p", 0x70}, 156*0b57cec5SDimitry Andric {"percent-sign", 0x25}, 157*0b57cec5SDimitry Andric {"period", 0x2e}, 158*0b57cec5SDimitry Andric {"plus-sign", 0x2b}, 159*0b57cec5SDimitry Andric {"q", 0x71}, 160*0b57cec5SDimitry Andric {"question-mark", 0x3f}, 161*0b57cec5SDimitry Andric {"quotation-mark", 0x22}, 162*0b57cec5SDimitry Andric {"r", 0x72}, 163*0b57cec5SDimitry Andric {"reverse-solidus", 0x5c}, 164*0b57cec5SDimitry Andric {"right-brace", 0x7d}, 165*0b57cec5SDimitry Andric {"right-curly-bracket", 0x7d}, 166*0b57cec5SDimitry Andric {"right-parenthesis", 0x29}, 167*0b57cec5SDimitry Andric {"right-square-bracket", 0x5d}, 168*0b57cec5SDimitry Andric {"s", 0x73}, 169*0b57cec5SDimitry Andric {"semicolon", 0x3b}, 170*0b57cec5SDimitry Andric {"seven", 0x37}, 171*0b57cec5SDimitry Andric {"six", 0x36}, 172*0b57cec5SDimitry Andric {"slash", 0x2f}, 173*0b57cec5SDimitry Andric {"solidus", 0x2f}, 174*0b57cec5SDimitry Andric {"space", 0x20}, 175*0b57cec5SDimitry Andric {"t", 0x74}, 176*0b57cec5SDimitry Andric {"tab", 0x09}, 177*0b57cec5SDimitry Andric {"three", 0x33}, 178*0b57cec5SDimitry Andric {"tilde", 0x7e}, 179*0b57cec5SDimitry Andric {"two", 0x32}, 180*0b57cec5SDimitry Andric {"u", 0x75}, 181*0b57cec5SDimitry Andric {"underscore", 0x5f}, 182*0b57cec5SDimitry Andric {"v", 0x76}, 183*0b57cec5SDimitry Andric {"vertical-line", 0x7c}, 184*0b57cec5SDimitry Andric {"vertical-tab", 0x0b}, 185*0b57cec5SDimitry Andric {"w", 0x77}, 186*0b57cec5SDimitry Andric {"x", 0x78}, 187*0b57cec5SDimitry Andric {"y", 0x79}, 188*0b57cec5SDimitry Andric {"z", 0x7a}, 189*0b57cec5SDimitry Andric {"zero", 0x30} 190*0b57cec5SDimitry Andric }; 191*0b57cec5SDimitry Andric 192*0b57cec5SDimitry Andric struct classnames 193*0b57cec5SDimitry Andric { 194*0b57cec5SDimitry Andric const char* elem_; 195*0b57cec5SDimitry Andric regex_traits<char>::char_class_type mask_; 196*0b57cec5SDimitry Andric }; 197*0b57cec5SDimitry Andric 198*0b57cec5SDimitry Andric const classnames ClassNames[] = 199*0b57cec5SDimitry Andric { 200*0b57cec5SDimitry Andric {"alnum", ctype_base::alnum}, 201*0b57cec5SDimitry Andric {"alpha", ctype_base::alpha}, 202*0b57cec5SDimitry Andric {"blank", ctype_base::blank}, 203*0b57cec5SDimitry Andric {"cntrl", ctype_base::cntrl}, 204*0b57cec5SDimitry Andric {"d", ctype_base::digit}, 205*0b57cec5SDimitry Andric {"digit", ctype_base::digit}, 206*0b57cec5SDimitry Andric {"graph", ctype_base::graph}, 207*0b57cec5SDimitry Andric {"lower", ctype_base::lower}, 208*0b57cec5SDimitry Andric {"print", ctype_base::print}, 209*0b57cec5SDimitry Andric {"punct", ctype_base::punct}, 210*0b57cec5SDimitry Andric {"s", ctype_base::space}, 211*0b57cec5SDimitry Andric {"space", ctype_base::space}, 212*0b57cec5SDimitry Andric {"upper", ctype_base::upper}, 213*0b57cec5SDimitry Andric {"w", regex_traits<char>::__regex_word}, 214*0b57cec5SDimitry Andric {"xdigit", ctype_base::xdigit} 215*0b57cec5SDimitry Andric }; 216*0b57cec5SDimitry Andric 217*0b57cec5SDimitry Andric struct use_strcmp 218*0b57cec5SDimitry Andric { 219*0b57cec5SDimitry Andric bool operator()(const collationnames& x, const char* y) 220*0b57cec5SDimitry Andric {return strcmp(x.elem_, y) < 0;} 221*0b57cec5SDimitry Andric bool operator()(const classnames& x, const char* y) 222*0b57cec5SDimitry Andric {return strcmp(x.elem_, y) < 0;} 223*0b57cec5SDimitry Andric }; 224*0b57cec5SDimitry Andric 225*0b57cec5SDimitry Andric } 226*0b57cec5SDimitry Andric 227*0b57cec5SDimitry Andric string 228*0b57cec5SDimitry Andric __get_collation_name(const char* s) 229*0b57cec5SDimitry Andric { 230*0b57cec5SDimitry Andric const collationnames* i = 231*0b57cec5SDimitry Andric _VSTD::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp()); 232*0b57cec5SDimitry Andric string r; 233*0b57cec5SDimitry Andric if (i != end(collatenames) && strcmp(s, i->elem_) == 0) 234*0b57cec5SDimitry Andric r = char(i->char_); 235*0b57cec5SDimitry Andric return r; 236*0b57cec5SDimitry Andric } 237*0b57cec5SDimitry Andric 238*0b57cec5SDimitry Andric regex_traits<char>::char_class_type 239*0b57cec5SDimitry Andric __get_classname(const char* s, bool __icase) 240*0b57cec5SDimitry Andric { 241*0b57cec5SDimitry Andric const classnames* i = 242*0b57cec5SDimitry Andric _VSTD::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp()); 243*0b57cec5SDimitry Andric regex_traits<char>::char_class_type r = 0; 244*0b57cec5SDimitry Andric if (i != end(ClassNames) && strcmp(s, i->elem_) == 0) 245*0b57cec5SDimitry Andric { 246*0b57cec5SDimitry Andric r = i->mask_; 247*0b57cec5SDimitry Andric if (r == regex_traits<char>::__regex_word) 248*0b57cec5SDimitry Andric r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower; 249*0b57cec5SDimitry Andric else if (__icase) 250*0b57cec5SDimitry Andric { 251*0b57cec5SDimitry Andric if (r & (ctype_base::lower | ctype_base::upper)) 252*0b57cec5SDimitry Andric r |= ctype_base::alpha; 253*0b57cec5SDimitry Andric } 254*0b57cec5SDimitry Andric } 255*0b57cec5SDimitry Andric return r; 256*0b57cec5SDimitry Andric } 257*0b57cec5SDimitry Andric 258*0b57cec5SDimitry Andric template <> 259*0b57cec5SDimitry Andric void 260*0b57cec5SDimitry Andric __match_any_but_newline<char>::__exec(__state& __s) const 261*0b57cec5SDimitry Andric { 262*0b57cec5SDimitry Andric if (__s.__current_ != __s.__last_) 263*0b57cec5SDimitry Andric { 264*0b57cec5SDimitry Andric switch (*__s.__current_) 265*0b57cec5SDimitry Andric { 266*0b57cec5SDimitry Andric case '\r': 267*0b57cec5SDimitry Andric case '\n': 268*0b57cec5SDimitry Andric __s.__do_ = __state::__reject; 269*0b57cec5SDimitry Andric __s.__node_ = nullptr; 270*0b57cec5SDimitry Andric break; 271*0b57cec5SDimitry Andric default: 272*0b57cec5SDimitry Andric __s.__do_ = __state::__accept_and_consume; 273*0b57cec5SDimitry Andric ++__s.__current_; 274*0b57cec5SDimitry Andric __s.__node_ = this->first(); 275*0b57cec5SDimitry Andric break; 276*0b57cec5SDimitry Andric } 277*0b57cec5SDimitry Andric } 278*0b57cec5SDimitry Andric else 279*0b57cec5SDimitry Andric { 280*0b57cec5SDimitry Andric __s.__do_ = __state::__reject; 281*0b57cec5SDimitry Andric __s.__node_ = nullptr; 282*0b57cec5SDimitry Andric } 283*0b57cec5SDimitry Andric } 284*0b57cec5SDimitry Andric 285*0b57cec5SDimitry Andric template <> 286*0b57cec5SDimitry Andric void 287*0b57cec5SDimitry Andric __match_any_but_newline<wchar_t>::__exec(__state& __s) const 288*0b57cec5SDimitry Andric { 289*0b57cec5SDimitry Andric if (__s.__current_ != __s.__last_) 290*0b57cec5SDimitry Andric { 291*0b57cec5SDimitry Andric switch (*__s.__current_) 292*0b57cec5SDimitry Andric { 293*0b57cec5SDimitry Andric case '\r': 294*0b57cec5SDimitry Andric case '\n': 295*0b57cec5SDimitry Andric case 0x2028: 296*0b57cec5SDimitry Andric case 0x2029: 297*0b57cec5SDimitry Andric __s.__do_ = __state::__reject; 298*0b57cec5SDimitry Andric __s.__node_ = nullptr; 299*0b57cec5SDimitry Andric break; 300*0b57cec5SDimitry Andric default: 301*0b57cec5SDimitry Andric __s.__do_ = __state::__accept_and_consume; 302*0b57cec5SDimitry Andric ++__s.__current_; 303*0b57cec5SDimitry Andric __s.__node_ = this->first(); 304*0b57cec5SDimitry Andric break; 305*0b57cec5SDimitry Andric } 306*0b57cec5SDimitry Andric } 307*0b57cec5SDimitry Andric else 308*0b57cec5SDimitry Andric { 309*0b57cec5SDimitry Andric __s.__do_ = __state::__reject; 310*0b57cec5SDimitry Andric __s.__node_ = nullptr; 311*0b57cec5SDimitry Andric } 312*0b57cec5SDimitry Andric } 313*0b57cec5SDimitry Andric 314*0b57cec5SDimitry Andric _LIBCPP_END_NAMESPACE_STD 315