1 //===----------------------------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "regex" 10 #include "algorithm" 11 #include "iterator" 12 13 _LIBCPP_BEGIN_NAMESPACE_STD 14 15 static 16 const char* 17 make_error_type_string(regex_constants::error_type ecode) 18 { 19 switch (ecode) 20 { 21 case regex_constants::error_collate: 22 return "The expression contained an invalid collating element name."; 23 case regex_constants::error_ctype: 24 return "The expression contained an invalid character class name."; 25 case regex_constants::error_escape: 26 return "The expression contained an invalid escaped character, or a " 27 "trailing escape."; 28 case regex_constants::error_backref: 29 return "The expression contained an invalid back reference."; 30 case regex_constants::error_brack: 31 return "The expression contained mismatched [ and ]."; 32 case regex_constants::error_paren: 33 return "The expression contained mismatched ( and )."; 34 case regex_constants::error_brace: 35 return "The expression contained mismatched { and }."; 36 case regex_constants::error_badbrace: 37 return "The expression contained an invalid range in a {} expression."; 38 case regex_constants::error_range: 39 return "The expression contained an invalid character range, " 40 "such as [b-a] in most encodings."; 41 case regex_constants::error_space: 42 return "There was insufficient memory to convert the expression into " 43 "a finite state machine."; 44 case regex_constants::error_badrepeat: 45 return "One of *?+{ was not preceded by a valid regular expression."; 46 case regex_constants::error_complexity: 47 return "The complexity of an attempted match against a regular " 48 "expression exceeded a pre-set level."; 49 case regex_constants::error_stack: 50 return "There was insufficient memory to determine whether the regular " 51 "expression could match the specified character sequence."; 52 case regex_constants::__re_err_grammar: 53 return "An invalid regex grammar has been requested."; 54 case regex_constants::__re_err_empty: 55 return "An empty regex is not allowed in the POSIX grammar."; 56 case regex_constants::__re_err_parse: 57 return "The parser did not consume the entire regular expression."; 58 default: 59 break; 60 } 61 return "Unknown error type"; 62 } 63 64 regex_error::regex_error(regex_constants::error_type ecode) 65 : runtime_error(make_error_type_string(ecode)), 66 __code_(ecode) 67 {} 68 69 regex_error::~regex_error() throw() {} 70 71 namespace { 72 73 struct collationnames 74 { 75 const char* elem_; 76 char char_; 77 }; 78 79 #if defined(__MVS__) && !defined(__NATIVE_ASCII_F) 80 // EBCDIC IBM-1047 81 // Sorted via the EBCDIC collating sequence 82 const collationnames collatenames[] = 83 { 84 {"a", 0x81}, 85 {"alert", 0x2f}, 86 {"ampersand", 0x50}, 87 {"apostrophe", 0x7d}, 88 {"asterisk", 0x5c}, 89 {"b", 0x82}, 90 {"backslash", 0xe0}, 91 {"backspace", 0x16}, 92 {"c", 0x83}, 93 {"carriage-return", 0xd}, 94 {"circumflex", 0x5f}, 95 {"circumflex-accent", 0x5f}, 96 {"colon", 0x7a}, 97 {"comma", 0x6b}, 98 {"commercial-at", 0x7c}, 99 {"d", 0x84}, 100 {"dollar-sign", 0x5b}, 101 {"e", 0x85}, 102 {"eight", 0xf8}, 103 {"equals-sign", 0x7e}, 104 {"exclamation-mark", 0x5a}, 105 {"f", 0x86}, 106 {"five", 0xf5}, 107 {"form-feed", 0xc}, 108 {"four", 0xf4}, 109 {"full-stop", 0x4b}, 110 {"g", 0x87}, 111 {"grave-accent", 0x79}, 112 {"greater-than-sign", 0x6e}, 113 {"h", 0x88}, 114 {"hyphen", 0x60}, 115 {"hyphen-minus", 0x60}, 116 {"i", 0x89}, 117 {"j", 0x91}, 118 {"k", 0x92}, 119 {"l", 0x93}, 120 {"left-brace", 0xc0}, 121 {"left-curly-bracket", 0xc0}, 122 {"left-parenthesis", 0x4d}, 123 {"left-square-bracket", 0xad}, 124 {"less-than-sign", 0x4c}, 125 {"low-line", 0x6d}, 126 {"m", 0x94}, 127 {"n", 0x95}, 128 {"newline", 0x15}, 129 {"nine", 0xf9}, 130 {"number-sign", 0x7b}, 131 {"o", 0x96}, 132 {"one", 0xf1}, 133 {"p", 0x97}, 134 {"percent-sign", 0x6c}, 135 {"period", 0x4b}, 136 {"plus-sign", 0x4e}, 137 {"q", 0x98}, 138 {"question-mark", 0x6f}, 139 {"quotation-mark", 0x7f}, 140 {"r", 0x99}, 141 {"reverse-solidus", 0xe0}, 142 {"right-brace", 0xd0}, 143 {"right-curly-bracket", 0xd0}, 144 {"right-parenthesis", 0x5d}, 145 {"right-square-bracket", 0xbd}, 146 {"s", 0xa2}, 147 {"semicolon", 0x5e}, 148 {"seven", 0xf7}, 149 {"six", 0xf6}, 150 {"slash", 0x61}, 151 {"solidus", 0x61}, 152 {"space", 0x40}, 153 {"t", 0xa3}, 154 {"tab", 0x5}, 155 {"three", 0xf3}, 156 {"tilde", 0xa1}, 157 {"two", 0xf2}, 158 {"u", 0xa4}, 159 {"underscore", 0x6d}, 160 {"v", 0xa5}, 161 {"vertical-line", 0x4f}, 162 {"vertical-tab", 0xb}, 163 {"w", 0xa6}, 164 {"x", 0xa7}, 165 {"y", 0xa8}, 166 {"z", 0xa9}, 167 {"zero", 0xf0}, 168 {"A", 0xc1}, 169 {"B", 0xc2}, 170 {"C", 0xc3}, 171 {"D", 0xc4}, 172 {"E", 0xc5}, 173 {"F", 0xc6}, 174 {"G", 0xc7}, 175 {"H", 0xc8}, 176 {"I", 0xc9}, 177 {"J", 0xd1}, 178 {"K", 0xd2}, 179 {"L", 0xd3}, 180 {"M", 0xd4}, 181 {"N", 0xd5}, 182 {"NUL", 0}, 183 {"O", 0xd6}, 184 {"P", 0xd7}, 185 {"Q", 0xd8}, 186 {"R", 0xd9}, 187 {"S", 0xe2}, 188 {"T", 0xe3}, 189 {"U", 0xe4}, 190 {"V", 0xe5}, 191 {"W", 0xe6}, 192 {"X", 0xe7}, 193 {"Y", 0xe8}, 194 {"Z", 0xe9} 195 }; 196 #else 197 // ASCII 198 const collationnames collatenames[] = 199 { 200 {"A", 0x41}, 201 {"B", 0x42}, 202 {"C", 0x43}, 203 {"D", 0x44}, 204 {"E", 0x45}, 205 {"F", 0x46}, 206 {"G", 0x47}, 207 {"H", 0x48}, 208 {"I", 0x49}, 209 {"J", 0x4a}, 210 {"K", 0x4b}, 211 {"L", 0x4c}, 212 {"M", 0x4d}, 213 {"N", 0x4e}, 214 {"NUL", 0x00}, 215 {"O", 0x4f}, 216 {"P", 0x50}, 217 {"Q", 0x51}, 218 {"R", 0x52}, 219 {"S", 0x53}, 220 {"T", 0x54}, 221 {"U", 0x55}, 222 {"V", 0x56}, 223 {"W", 0x57}, 224 {"X", 0x58}, 225 {"Y", 0x59}, 226 {"Z", 0x5a}, 227 {"a", 0x61}, 228 {"alert", 0x07}, 229 {"ampersand", 0x26}, 230 {"apostrophe", 0x27}, 231 {"asterisk", 0x2a}, 232 {"b", 0x62}, 233 {"backslash", 0x5c}, 234 {"backspace", 0x08}, 235 {"c", 0x63}, 236 {"carriage-return", 0x0d}, 237 {"circumflex", 0x5e}, 238 {"circumflex-accent", 0x5e}, 239 {"colon", 0x3a}, 240 {"comma", 0x2c}, 241 {"commercial-at", 0x40}, 242 {"d", 0x64}, 243 {"dollar-sign", 0x24}, 244 {"e", 0x65}, 245 {"eight", 0x38}, 246 {"equals-sign", 0x3d}, 247 {"exclamation-mark", 0x21}, 248 {"f", 0x66}, 249 {"five", 0x35}, 250 {"form-feed", 0x0c}, 251 {"four", 0x34}, 252 {"full-stop", 0x2e}, 253 {"g", 0x67}, 254 {"grave-accent", 0x60}, 255 {"greater-than-sign", 0x3e}, 256 {"h", 0x68}, 257 {"hyphen", 0x2d}, 258 {"hyphen-minus", 0x2d}, 259 {"i", 0x69}, 260 {"j", 0x6a}, 261 {"k", 0x6b}, 262 {"l", 0x6c}, 263 {"left-brace", 0x7b}, 264 {"left-curly-bracket", 0x7b}, 265 {"left-parenthesis", 0x28}, 266 {"left-square-bracket", 0x5b}, 267 {"less-than-sign", 0x3c}, 268 {"low-line", 0x5f}, 269 {"m", 0x6d}, 270 {"n", 0x6e}, 271 {"newline", 0x0a}, 272 {"nine", 0x39}, 273 {"number-sign", 0x23}, 274 {"o", 0x6f}, 275 {"one", 0x31}, 276 {"p", 0x70}, 277 {"percent-sign", 0x25}, 278 {"period", 0x2e}, 279 {"plus-sign", 0x2b}, 280 {"q", 0x71}, 281 {"question-mark", 0x3f}, 282 {"quotation-mark", 0x22}, 283 {"r", 0x72}, 284 {"reverse-solidus", 0x5c}, 285 {"right-brace", 0x7d}, 286 {"right-curly-bracket", 0x7d}, 287 {"right-parenthesis", 0x29}, 288 {"right-square-bracket", 0x5d}, 289 {"s", 0x73}, 290 {"semicolon", 0x3b}, 291 {"seven", 0x37}, 292 {"six", 0x36}, 293 {"slash", 0x2f}, 294 {"solidus", 0x2f}, 295 {"space", 0x20}, 296 {"t", 0x74}, 297 {"tab", 0x09}, 298 {"three", 0x33}, 299 {"tilde", 0x7e}, 300 {"two", 0x32}, 301 {"u", 0x75}, 302 {"underscore", 0x5f}, 303 {"v", 0x76}, 304 {"vertical-line", 0x7c}, 305 {"vertical-tab", 0x0b}, 306 {"w", 0x77}, 307 {"x", 0x78}, 308 {"y", 0x79}, 309 {"z", 0x7a}, 310 {"zero", 0x30} 311 }; 312 #endif 313 314 struct classnames 315 { 316 const char* elem_; 317 regex_traits<char>::char_class_type mask_; 318 }; 319 320 const classnames ClassNames[] = 321 { 322 {"alnum", ctype_base::alnum}, 323 {"alpha", ctype_base::alpha}, 324 {"blank", ctype_base::blank}, 325 {"cntrl", ctype_base::cntrl}, 326 {"d", ctype_base::digit}, 327 {"digit", ctype_base::digit}, 328 {"graph", ctype_base::graph}, 329 {"lower", ctype_base::lower}, 330 {"print", ctype_base::print}, 331 {"punct", ctype_base::punct}, 332 {"s", ctype_base::space}, 333 {"space", ctype_base::space}, 334 {"upper", ctype_base::upper}, 335 {"w", regex_traits<char>::__regex_word}, 336 {"xdigit", ctype_base::xdigit} 337 }; 338 339 struct use_strcmp 340 { 341 bool operator()(const collationnames& x, const char* y) 342 {return strcmp(x.elem_, y) < 0;} 343 bool operator()(const classnames& x, const char* y) 344 {return strcmp(x.elem_, y) < 0;} 345 }; 346 347 } 348 349 string 350 __get_collation_name(const char* s) 351 { 352 const collationnames* i = 353 _VSTD::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp()); 354 string r; 355 if (i != end(collatenames) && strcmp(s, i->elem_) == 0) 356 r = char(i->char_); 357 return r; 358 } 359 360 regex_traits<char>::char_class_type 361 __get_classname(const char* s, bool __icase) 362 { 363 const classnames* i = 364 _VSTD::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp()); 365 regex_traits<char>::char_class_type r = 0; 366 if (i != end(ClassNames) && strcmp(s, i->elem_) == 0) 367 { 368 r = i->mask_; 369 if (r == regex_traits<char>::__regex_word) 370 r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower; 371 else if (__icase) 372 { 373 if (r & (ctype_base::lower | ctype_base::upper)) 374 r |= ctype_base::alpha; 375 } 376 } 377 return r; 378 } 379 380 template <> 381 void 382 __match_any_but_newline<char>::__exec(__state& __s) const 383 { 384 if (__s.__current_ != __s.__last_) 385 { 386 switch (*__s.__current_) 387 { 388 case '\r': 389 case '\n': 390 __s.__do_ = __state::__reject; 391 __s.__node_ = nullptr; 392 break; 393 default: 394 __s.__do_ = __state::__accept_and_consume; 395 ++__s.__current_; 396 __s.__node_ = this->first(); 397 break; 398 } 399 } 400 else 401 { 402 __s.__do_ = __state::__reject; 403 __s.__node_ = nullptr; 404 } 405 } 406 407 template <> 408 void 409 __match_any_but_newline<wchar_t>::__exec(__state& __s) const 410 { 411 if (__s.__current_ != __s.__last_) 412 { 413 switch (*__s.__current_) 414 { 415 case '\r': 416 case '\n': 417 case 0x2028: 418 case 0x2029: 419 __s.__do_ = __state::__reject; 420 __s.__node_ = nullptr; 421 break; 422 default: 423 __s.__do_ = __state::__accept_and_consume; 424 ++__s.__current_; 425 __s.__node_ = this->first(); 426 break; 427 } 428 } 429 else 430 { 431 __s.__do_ = __state::__reject; 432 __s.__node_ = nullptr; 433 } 434 } 435 436 _LIBCPP_END_NAMESPACE_STD 437