10b57cec5SDimitry Andric //===-------------------------- regex.cpp ---------------------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric 90b57cec5SDimitry Andric #include "regex" 100b57cec5SDimitry Andric #include "algorithm" 110b57cec5SDimitry Andric #include "iterator" 120b57cec5SDimitry Andric 130b57cec5SDimitry Andric _LIBCPP_BEGIN_NAMESPACE_STD 140b57cec5SDimitry Andric 150b57cec5SDimitry Andric static 160b57cec5SDimitry Andric const char* 170b57cec5SDimitry Andric make_error_type_string(regex_constants::error_type ecode) 180b57cec5SDimitry Andric { 190b57cec5SDimitry Andric switch (ecode) 200b57cec5SDimitry Andric { 210b57cec5SDimitry Andric case regex_constants::error_collate: 220b57cec5SDimitry Andric return "The expression contained an invalid collating element name."; 230b57cec5SDimitry Andric case regex_constants::error_ctype: 240b57cec5SDimitry Andric return "The expression contained an invalid character class name."; 250b57cec5SDimitry Andric case regex_constants::error_escape: 260b57cec5SDimitry Andric return "The expression contained an invalid escaped character, or a " 270b57cec5SDimitry Andric "trailing escape."; 280b57cec5SDimitry Andric case regex_constants::error_backref: 290b57cec5SDimitry Andric return "The expression contained an invalid back reference."; 300b57cec5SDimitry Andric case regex_constants::error_brack: 310b57cec5SDimitry Andric return "The expression contained mismatched [ and ]."; 320b57cec5SDimitry Andric case regex_constants::error_paren: 330b57cec5SDimitry Andric return "The expression contained mismatched ( and )."; 340b57cec5SDimitry Andric case regex_constants::error_brace: 350b57cec5SDimitry Andric return "The expression contained mismatched { and }."; 360b57cec5SDimitry Andric case regex_constants::error_badbrace: 370b57cec5SDimitry Andric return "The expression contained an invalid range in a {} expression."; 380b57cec5SDimitry Andric case regex_constants::error_range: 390b57cec5SDimitry Andric return "The expression contained an invalid character range, " 400b57cec5SDimitry Andric "such as [b-a] in most encodings."; 410b57cec5SDimitry Andric case regex_constants::error_space: 420b57cec5SDimitry Andric return "There was insufficient memory to convert the expression into " 430b57cec5SDimitry Andric "a finite state machine."; 440b57cec5SDimitry Andric case regex_constants::error_badrepeat: 450b57cec5SDimitry Andric return "One of *?+{ was not preceded by a valid regular expression."; 460b57cec5SDimitry Andric case regex_constants::error_complexity: 470b57cec5SDimitry Andric return "The complexity of an attempted match against a regular " 480b57cec5SDimitry Andric "expression exceeded a pre-set level."; 490b57cec5SDimitry Andric case regex_constants::error_stack: 500b57cec5SDimitry Andric return "There was insufficient memory to determine whether the regular " 510b57cec5SDimitry Andric "expression could match the specified character sequence."; 520b57cec5SDimitry Andric case regex_constants::__re_err_grammar: 530b57cec5SDimitry Andric return "An invalid regex grammar has been requested."; 540b57cec5SDimitry Andric case regex_constants::__re_err_empty: 550b57cec5SDimitry Andric return "An empty regex is not allowed in the POSIX grammar."; 56*480093f4SDimitry Andric case regex_constants::__re_err_parse: 57*480093f4SDimitry Andric return "The parser did not consume the entire regular expression."; 580b57cec5SDimitry Andric default: 590b57cec5SDimitry Andric break; 600b57cec5SDimitry Andric } 610b57cec5SDimitry Andric return "Unknown error type"; 620b57cec5SDimitry Andric } 630b57cec5SDimitry Andric 640b57cec5SDimitry Andric regex_error::regex_error(regex_constants::error_type ecode) 650b57cec5SDimitry Andric : runtime_error(make_error_type_string(ecode)), 660b57cec5SDimitry Andric __code_(ecode) 670b57cec5SDimitry Andric {} 680b57cec5SDimitry Andric 690b57cec5SDimitry Andric regex_error::~regex_error() throw() {} 700b57cec5SDimitry Andric 710b57cec5SDimitry Andric namespace { 720b57cec5SDimitry Andric 730b57cec5SDimitry Andric struct collationnames 740b57cec5SDimitry Andric { 750b57cec5SDimitry Andric const char* elem_; 760b57cec5SDimitry Andric char char_; 770b57cec5SDimitry Andric }; 780b57cec5SDimitry Andric 790b57cec5SDimitry Andric const collationnames collatenames[] = 800b57cec5SDimitry Andric { 810b57cec5SDimitry Andric {"A", 0x41}, 820b57cec5SDimitry Andric {"B", 0x42}, 830b57cec5SDimitry Andric {"C", 0x43}, 840b57cec5SDimitry Andric {"D", 0x44}, 850b57cec5SDimitry Andric {"E", 0x45}, 860b57cec5SDimitry Andric {"F", 0x46}, 870b57cec5SDimitry Andric {"G", 0x47}, 880b57cec5SDimitry Andric {"H", 0x48}, 890b57cec5SDimitry Andric {"I", 0x49}, 900b57cec5SDimitry Andric {"J", 0x4a}, 910b57cec5SDimitry Andric {"K", 0x4b}, 920b57cec5SDimitry Andric {"L", 0x4c}, 930b57cec5SDimitry Andric {"M", 0x4d}, 940b57cec5SDimitry Andric {"N", 0x4e}, 950b57cec5SDimitry Andric {"NUL", 0x00}, 960b57cec5SDimitry Andric {"O", 0x4f}, 970b57cec5SDimitry Andric {"P", 0x50}, 980b57cec5SDimitry Andric {"Q", 0x51}, 990b57cec5SDimitry Andric {"R", 0x52}, 1000b57cec5SDimitry Andric {"S", 0x53}, 1010b57cec5SDimitry Andric {"T", 0x54}, 1020b57cec5SDimitry Andric {"U", 0x55}, 1030b57cec5SDimitry Andric {"V", 0x56}, 1040b57cec5SDimitry Andric {"W", 0x57}, 1050b57cec5SDimitry Andric {"X", 0x58}, 1060b57cec5SDimitry Andric {"Y", 0x59}, 1070b57cec5SDimitry Andric {"Z", 0x5a}, 1080b57cec5SDimitry Andric {"a", 0x61}, 1090b57cec5SDimitry Andric {"alert", 0x07}, 1100b57cec5SDimitry Andric {"ampersand", 0x26}, 1110b57cec5SDimitry Andric {"apostrophe", 0x27}, 1120b57cec5SDimitry Andric {"asterisk", 0x2a}, 1130b57cec5SDimitry Andric {"b", 0x62}, 1140b57cec5SDimitry Andric {"backslash", 0x5c}, 1150b57cec5SDimitry Andric {"backspace", 0x08}, 1160b57cec5SDimitry Andric {"c", 0x63}, 1170b57cec5SDimitry Andric {"carriage-return", 0x0d}, 1180b57cec5SDimitry Andric {"circumflex", 0x5e}, 1190b57cec5SDimitry Andric {"circumflex-accent", 0x5e}, 1200b57cec5SDimitry Andric {"colon", 0x3a}, 1210b57cec5SDimitry Andric {"comma", 0x2c}, 1220b57cec5SDimitry Andric {"commercial-at", 0x40}, 1230b57cec5SDimitry Andric {"d", 0x64}, 1240b57cec5SDimitry Andric {"dollar-sign", 0x24}, 1250b57cec5SDimitry Andric {"e", 0x65}, 1260b57cec5SDimitry Andric {"eight", 0x38}, 1270b57cec5SDimitry Andric {"equals-sign", 0x3d}, 1280b57cec5SDimitry Andric {"exclamation-mark", 0x21}, 1290b57cec5SDimitry Andric {"f", 0x66}, 1300b57cec5SDimitry Andric {"five", 0x35}, 1310b57cec5SDimitry Andric {"form-feed", 0x0c}, 1320b57cec5SDimitry Andric {"four", 0x34}, 1330b57cec5SDimitry Andric {"full-stop", 0x2e}, 1340b57cec5SDimitry Andric {"g", 0x67}, 1350b57cec5SDimitry Andric {"grave-accent", 0x60}, 1360b57cec5SDimitry Andric {"greater-than-sign", 0x3e}, 1370b57cec5SDimitry Andric {"h", 0x68}, 1380b57cec5SDimitry Andric {"hyphen", 0x2d}, 1390b57cec5SDimitry Andric {"hyphen-minus", 0x2d}, 1400b57cec5SDimitry Andric {"i", 0x69}, 1410b57cec5SDimitry Andric {"j", 0x6a}, 1420b57cec5SDimitry Andric {"k", 0x6b}, 1430b57cec5SDimitry Andric {"l", 0x6c}, 1440b57cec5SDimitry Andric {"left-brace", 0x7b}, 1450b57cec5SDimitry Andric {"left-curly-bracket", 0x7b}, 1460b57cec5SDimitry Andric {"left-parenthesis", 0x28}, 1470b57cec5SDimitry Andric {"left-square-bracket", 0x5b}, 1480b57cec5SDimitry Andric {"less-than-sign", 0x3c}, 1490b57cec5SDimitry Andric {"low-line", 0x5f}, 1500b57cec5SDimitry Andric {"m", 0x6d}, 1510b57cec5SDimitry Andric {"n", 0x6e}, 1520b57cec5SDimitry Andric {"newline", 0x0a}, 1530b57cec5SDimitry Andric {"nine", 0x39}, 1540b57cec5SDimitry Andric {"number-sign", 0x23}, 1550b57cec5SDimitry Andric {"o", 0x6f}, 1560b57cec5SDimitry Andric {"one", 0x31}, 1570b57cec5SDimitry Andric {"p", 0x70}, 1580b57cec5SDimitry Andric {"percent-sign", 0x25}, 1590b57cec5SDimitry Andric {"period", 0x2e}, 1600b57cec5SDimitry Andric {"plus-sign", 0x2b}, 1610b57cec5SDimitry Andric {"q", 0x71}, 1620b57cec5SDimitry Andric {"question-mark", 0x3f}, 1630b57cec5SDimitry Andric {"quotation-mark", 0x22}, 1640b57cec5SDimitry Andric {"r", 0x72}, 1650b57cec5SDimitry Andric {"reverse-solidus", 0x5c}, 1660b57cec5SDimitry Andric {"right-brace", 0x7d}, 1670b57cec5SDimitry Andric {"right-curly-bracket", 0x7d}, 1680b57cec5SDimitry Andric {"right-parenthesis", 0x29}, 1690b57cec5SDimitry Andric {"right-square-bracket", 0x5d}, 1700b57cec5SDimitry Andric {"s", 0x73}, 1710b57cec5SDimitry Andric {"semicolon", 0x3b}, 1720b57cec5SDimitry Andric {"seven", 0x37}, 1730b57cec5SDimitry Andric {"six", 0x36}, 1740b57cec5SDimitry Andric {"slash", 0x2f}, 1750b57cec5SDimitry Andric {"solidus", 0x2f}, 1760b57cec5SDimitry Andric {"space", 0x20}, 1770b57cec5SDimitry Andric {"t", 0x74}, 1780b57cec5SDimitry Andric {"tab", 0x09}, 1790b57cec5SDimitry Andric {"three", 0x33}, 1800b57cec5SDimitry Andric {"tilde", 0x7e}, 1810b57cec5SDimitry Andric {"two", 0x32}, 1820b57cec5SDimitry Andric {"u", 0x75}, 1830b57cec5SDimitry Andric {"underscore", 0x5f}, 1840b57cec5SDimitry Andric {"v", 0x76}, 1850b57cec5SDimitry Andric {"vertical-line", 0x7c}, 1860b57cec5SDimitry Andric {"vertical-tab", 0x0b}, 1870b57cec5SDimitry Andric {"w", 0x77}, 1880b57cec5SDimitry Andric {"x", 0x78}, 1890b57cec5SDimitry Andric {"y", 0x79}, 1900b57cec5SDimitry Andric {"z", 0x7a}, 1910b57cec5SDimitry Andric {"zero", 0x30} 1920b57cec5SDimitry Andric }; 1930b57cec5SDimitry Andric 1940b57cec5SDimitry Andric struct classnames 1950b57cec5SDimitry Andric { 1960b57cec5SDimitry Andric const char* elem_; 1970b57cec5SDimitry Andric regex_traits<char>::char_class_type mask_; 1980b57cec5SDimitry Andric }; 1990b57cec5SDimitry Andric 2000b57cec5SDimitry Andric const classnames ClassNames[] = 2010b57cec5SDimitry Andric { 2020b57cec5SDimitry Andric {"alnum", ctype_base::alnum}, 2030b57cec5SDimitry Andric {"alpha", ctype_base::alpha}, 2040b57cec5SDimitry Andric {"blank", ctype_base::blank}, 2050b57cec5SDimitry Andric {"cntrl", ctype_base::cntrl}, 2060b57cec5SDimitry Andric {"d", ctype_base::digit}, 2070b57cec5SDimitry Andric {"digit", ctype_base::digit}, 2080b57cec5SDimitry Andric {"graph", ctype_base::graph}, 2090b57cec5SDimitry Andric {"lower", ctype_base::lower}, 2100b57cec5SDimitry Andric {"print", ctype_base::print}, 2110b57cec5SDimitry Andric {"punct", ctype_base::punct}, 2120b57cec5SDimitry Andric {"s", ctype_base::space}, 2130b57cec5SDimitry Andric {"space", ctype_base::space}, 2140b57cec5SDimitry Andric {"upper", ctype_base::upper}, 2150b57cec5SDimitry Andric {"w", regex_traits<char>::__regex_word}, 2160b57cec5SDimitry Andric {"xdigit", ctype_base::xdigit} 2170b57cec5SDimitry Andric }; 2180b57cec5SDimitry Andric 2190b57cec5SDimitry Andric struct use_strcmp 2200b57cec5SDimitry Andric { 2210b57cec5SDimitry Andric bool operator()(const collationnames& x, const char* y) 2220b57cec5SDimitry Andric {return strcmp(x.elem_, y) < 0;} 2230b57cec5SDimitry Andric bool operator()(const classnames& x, const char* y) 2240b57cec5SDimitry Andric {return strcmp(x.elem_, y) < 0;} 2250b57cec5SDimitry Andric }; 2260b57cec5SDimitry Andric 2270b57cec5SDimitry Andric } 2280b57cec5SDimitry Andric 2290b57cec5SDimitry Andric string 2300b57cec5SDimitry Andric __get_collation_name(const char* s) 2310b57cec5SDimitry Andric { 2320b57cec5SDimitry Andric const collationnames* i = 2330b57cec5SDimitry Andric _VSTD::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp()); 2340b57cec5SDimitry Andric string r; 2350b57cec5SDimitry Andric if (i != end(collatenames) && strcmp(s, i->elem_) == 0) 2360b57cec5SDimitry Andric r = char(i->char_); 2370b57cec5SDimitry Andric return r; 2380b57cec5SDimitry Andric } 2390b57cec5SDimitry Andric 2400b57cec5SDimitry Andric regex_traits<char>::char_class_type 2410b57cec5SDimitry Andric __get_classname(const char* s, bool __icase) 2420b57cec5SDimitry Andric { 2430b57cec5SDimitry Andric const classnames* i = 2440b57cec5SDimitry Andric _VSTD::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp()); 2450b57cec5SDimitry Andric regex_traits<char>::char_class_type r = 0; 2460b57cec5SDimitry Andric if (i != end(ClassNames) && strcmp(s, i->elem_) == 0) 2470b57cec5SDimitry Andric { 2480b57cec5SDimitry Andric r = i->mask_; 2490b57cec5SDimitry Andric if (r == regex_traits<char>::__regex_word) 2500b57cec5SDimitry Andric r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower; 2510b57cec5SDimitry Andric else if (__icase) 2520b57cec5SDimitry Andric { 2530b57cec5SDimitry Andric if (r & (ctype_base::lower | ctype_base::upper)) 2540b57cec5SDimitry Andric r |= ctype_base::alpha; 2550b57cec5SDimitry Andric } 2560b57cec5SDimitry Andric } 2570b57cec5SDimitry Andric return r; 2580b57cec5SDimitry Andric } 2590b57cec5SDimitry Andric 2600b57cec5SDimitry Andric template <> 2610b57cec5SDimitry Andric void 2620b57cec5SDimitry Andric __match_any_but_newline<char>::__exec(__state& __s) const 2630b57cec5SDimitry Andric { 2640b57cec5SDimitry Andric if (__s.__current_ != __s.__last_) 2650b57cec5SDimitry Andric { 2660b57cec5SDimitry Andric switch (*__s.__current_) 2670b57cec5SDimitry Andric { 2680b57cec5SDimitry Andric case '\r': 2690b57cec5SDimitry Andric case '\n': 2700b57cec5SDimitry Andric __s.__do_ = __state::__reject; 2710b57cec5SDimitry Andric __s.__node_ = nullptr; 2720b57cec5SDimitry Andric break; 2730b57cec5SDimitry Andric default: 2740b57cec5SDimitry Andric __s.__do_ = __state::__accept_and_consume; 2750b57cec5SDimitry Andric ++__s.__current_; 2760b57cec5SDimitry Andric __s.__node_ = this->first(); 2770b57cec5SDimitry Andric break; 2780b57cec5SDimitry Andric } 2790b57cec5SDimitry Andric } 2800b57cec5SDimitry Andric else 2810b57cec5SDimitry Andric { 2820b57cec5SDimitry Andric __s.__do_ = __state::__reject; 2830b57cec5SDimitry Andric __s.__node_ = nullptr; 2840b57cec5SDimitry Andric } 2850b57cec5SDimitry Andric } 2860b57cec5SDimitry Andric 2870b57cec5SDimitry Andric template <> 2880b57cec5SDimitry Andric void 2890b57cec5SDimitry Andric __match_any_but_newline<wchar_t>::__exec(__state& __s) const 2900b57cec5SDimitry Andric { 2910b57cec5SDimitry Andric if (__s.__current_ != __s.__last_) 2920b57cec5SDimitry Andric { 2930b57cec5SDimitry Andric switch (*__s.__current_) 2940b57cec5SDimitry Andric { 2950b57cec5SDimitry Andric case '\r': 2960b57cec5SDimitry Andric case '\n': 2970b57cec5SDimitry Andric case 0x2028: 2980b57cec5SDimitry Andric case 0x2029: 2990b57cec5SDimitry Andric __s.__do_ = __state::__reject; 3000b57cec5SDimitry Andric __s.__node_ = nullptr; 3010b57cec5SDimitry Andric break; 3020b57cec5SDimitry Andric default: 3030b57cec5SDimitry Andric __s.__do_ = __state::__accept_and_consume; 3040b57cec5SDimitry Andric ++__s.__current_; 3050b57cec5SDimitry Andric __s.__node_ = this->first(); 3060b57cec5SDimitry Andric break; 3070b57cec5SDimitry Andric } 3080b57cec5SDimitry Andric } 3090b57cec5SDimitry Andric else 3100b57cec5SDimitry Andric { 3110b57cec5SDimitry Andric __s.__do_ = __state::__reject; 3120b57cec5SDimitry Andric __s.__node_ = nullptr; 3130b57cec5SDimitry Andric } 3140b57cec5SDimitry Andric } 3150b57cec5SDimitry Andric 3160b57cec5SDimitry Andric _LIBCPP_END_NAMESPACE_STD 317