1 //===-------------------------- regex.cpp ---------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "regex" 10 #include "algorithm" 11 #include "iterator" 12 13 _LIBCPP_BEGIN_NAMESPACE_STD 14 15 static 16 const char* 17 make_error_type_string(regex_constants::error_type ecode) 18 { 19 switch (ecode) 20 { 21 case regex_constants::error_collate: 22 return "The expression contained an invalid collating element name."; 23 case regex_constants::error_ctype: 24 return "The expression contained an invalid character class name."; 25 case regex_constants::error_escape: 26 return "The expression contained an invalid escaped character, or a " 27 "trailing escape."; 28 case regex_constants::error_backref: 29 return "The expression contained an invalid back reference."; 30 case regex_constants::error_brack: 31 return "The expression contained mismatched [ and ]."; 32 case regex_constants::error_paren: 33 return "The expression contained mismatched ( and )."; 34 case regex_constants::error_brace: 35 return "The expression contained mismatched { and }."; 36 case regex_constants::error_badbrace: 37 return "The expression contained an invalid range in a {} expression."; 38 case regex_constants::error_range: 39 return "The expression contained an invalid character range, " 40 "such as [b-a] in most encodings."; 41 case regex_constants::error_space: 42 return "There was insufficient memory to convert the expression into " 43 "a finite state machine."; 44 case regex_constants::error_badrepeat: 45 return "One of *?+{ was not preceded by a valid regular expression."; 46 case regex_constants::error_complexity: 47 return "The complexity of an attempted match against a regular " 48 "expression exceeded a pre-set level."; 49 case regex_constants::error_stack: 50 return "There was insufficient memory to determine whether the regular " 51 "expression could match the specified character sequence."; 52 case regex_constants::__re_err_grammar: 53 return "An invalid regex grammar has been requested."; 54 case regex_constants::__re_err_empty: 55 return "An empty regex is not allowed in the POSIX grammar."; 56 case regex_constants::__re_err_parse: 57 return "The parser did not consume the entire regular expression."; 58 default: 59 break; 60 } 61 return "Unknown error type"; 62 } 63 64 regex_error::regex_error(regex_constants::error_type ecode) 65 : runtime_error(make_error_type_string(ecode)), 66 __code_(ecode) 67 {} 68 69 regex_error::~regex_error() throw() {} 70 71 namespace { 72 73 struct collationnames 74 { 75 const char* elem_; 76 char char_; 77 }; 78 79 const collationnames collatenames[] = 80 { 81 {"A", 0x41}, 82 {"B", 0x42}, 83 {"C", 0x43}, 84 {"D", 0x44}, 85 {"E", 0x45}, 86 {"F", 0x46}, 87 {"G", 0x47}, 88 {"H", 0x48}, 89 {"I", 0x49}, 90 {"J", 0x4a}, 91 {"K", 0x4b}, 92 {"L", 0x4c}, 93 {"M", 0x4d}, 94 {"N", 0x4e}, 95 {"NUL", 0x00}, 96 {"O", 0x4f}, 97 {"P", 0x50}, 98 {"Q", 0x51}, 99 {"R", 0x52}, 100 {"S", 0x53}, 101 {"T", 0x54}, 102 {"U", 0x55}, 103 {"V", 0x56}, 104 {"W", 0x57}, 105 {"X", 0x58}, 106 {"Y", 0x59}, 107 {"Z", 0x5a}, 108 {"a", 0x61}, 109 {"alert", 0x07}, 110 {"ampersand", 0x26}, 111 {"apostrophe", 0x27}, 112 {"asterisk", 0x2a}, 113 {"b", 0x62}, 114 {"backslash", 0x5c}, 115 {"backspace", 0x08}, 116 {"c", 0x63}, 117 {"carriage-return", 0x0d}, 118 {"circumflex", 0x5e}, 119 {"circumflex-accent", 0x5e}, 120 {"colon", 0x3a}, 121 {"comma", 0x2c}, 122 {"commercial-at", 0x40}, 123 {"d", 0x64}, 124 {"dollar-sign", 0x24}, 125 {"e", 0x65}, 126 {"eight", 0x38}, 127 {"equals-sign", 0x3d}, 128 {"exclamation-mark", 0x21}, 129 {"f", 0x66}, 130 {"five", 0x35}, 131 {"form-feed", 0x0c}, 132 {"four", 0x34}, 133 {"full-stop", 0x2e}, 134 {"g", 0x67}, 135 {"grave-accent", 0x60}, 136 {"greater-than-sign", 0x3e}, 137 {"h", 0x68}, 138 {"hyphen", 0x2d}, 139 {"hyphen-minus", 0x2d}, 140 {"i", 0x69}, 141 {"j", 0x6a}, 142 {"k", 0x6b}, 143 {"l", 0x6c}, 144 {"left-brace", 0x7b}, 145 {"left-curly-bracket", 0x7b}, 146 {"left-parenthesis", 0x28}, 147 {"left-square-bracket", 0x5b}, 148 {"less-than-sign", 0x3c}, 149 {"low-line", 0x5f}, 150 {"m", 0x6d}, 151 {"n", 0x6e}, 152 {"newline", 0x0a}, 153 {"nine", 0x39}, 154 {"number-sign", 0x23}, 155 {"o", 0x6f}, 156 {"one", 0x31}, 157 {"p", 0x70}, 158 {"percent-sign", 0x25}, 159 {"period", 0x2e}, 160 {"plus-sign", 0x2b}, 161 {"q", 0x71}, 162 {"question-mark", 0x3f}, 163 {"quotation-mark", 0x22}, 164 {"r", 0x72}, 165 {"reverse-solidus", 0x5c}, 166 {"right-brace", 0x7d}, 167 {"right-curly-bracket", 0x7d}, 168 {"right-parenthesis", 0x29}, 169 {"right-square-bracket", 0x5d}, 170 {"s", 0x73}, 171 {"semicolon", 0x3b}, 172 {"seven", 0x37}, 173 {"six", 0x36}, 174 {"slash", 0x2f}, 175 {"solidus", 0x2f}, 176 {"space", 0x20}, 177 {"t", 0x74}, 178 {"tab", 0x09}, 179 {"three", 0x33}, 180 {"tilde", 0x7e}, 181 {"two", 0x32}, 182 {"u", 0x75}, 183 {"underscore", 0x5f}, 184 {"v", 0x76}, 185 {"vertical-line", 0x7c}, 186 {"vertical-tab", 0x0b}, 187 {"w", 0x77}, 188 {"x", 0x78}, 189 {"y", 0x79}, 190 {"z", 0x7a}, 191 {"zero", 0x30} 192 }; 193 194 struct classnames 195 { 196 const char* elem_; 197 regex_traits<char>::char_class_type mask_; 198 }; 199 200 const classnames ClassNames[] = 201 { 202 {"alnum", ctype_base::alnum}, 203 {"alpha", ctype_base::alpha}, 204 {"blank", ctype_base::blank}, 205 {"cntrl", ctype_base::cntrl}, 206 {"d", ctype_base::digit}, 207 {"digit", ctype_base::digit}, 208 {"graph", ctype_base::graph}, 209 {"lower", ctype_base::lower}, 210 {"print", ctype_base::print}, 211 {"punct", ctype_base::punct}, 212 {"s", ctype_base::space}, 213 {"space", ctype_base::space}, 214 {"upper", ctype_base::upper}, 215 {"w", regex_traits<char>::__regex_word}, 216 {"xdigit", ctype_base::xdigit} 217 }; 218 219 struct use_strcmp 220 { 221 bool operator()(const collationnames& x, const char* y) 222 {return strcmp(x.elem_, y) < 0;} 223 bool operator()(const classnames& x, const char* y) 224 {return strcmp(x.elem_, y) < 0;} 225 }; 226 227 } 228 229 string 230 __get_collation_name(const char* s) 231 { 232 const collationnames* i = 233 _VSTD::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp()); 234 string r; 235 if (i != end(collatenames) && strcmp(s, i->elem_) == 0) 236 r = char(i->char_); 237 return r; 238 } 239 240 regex_traits<char>::char_class_type 241 __get_classname(const char* s, bool __icase) 242 { 243 const classnames* i = 244 _VSTD::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp()); 245 regex_traits<char>::char_class_type r = 0; 246 if (i != end(ClassNames) && strcmp(s, i->elem_) == 0) 247 { 248 r = i->mask_; 249 if (r == regex_traits<char>::__regex_word) 250 r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower; 251 else if (__icase) 252 { 253 if (r & (ctype_base::lower | ctype_base::upper)) 254 r |= ctype_base::alpha; 255 } 256 } 257 return r; 258 } 259 260 template <> 261 void 262 __match_any_but_newline<char>::__exec(__state& __s) const 263 { 264 if (__s.__current_ != __s.__last_) 265 { 266 switch (*__s.__current_) 267 { 268 case '\r': 269 case '\n': 270 __s.__do_ = __state::__reject; 271 __s.__node_ = nullptr; 272 break; 273 default: 274 __s.__do_ = __state::__accept_and_consume; 275 ++__s.__current_; 276 __s.__node_ = this->first(); 277 break; 278 } 279 } 280 else 281 { 282 __s.__do_ = __state::__reject; 283 __s.__node_ = nullptr; 284 } 285 } 286 287 template <> 288 void 289 __match_any_but_newline<wchar_t>::__exec(__state& __s) const 290 { 291 if (__s.__current_ != __s.__last_) 292 { 293 switch (*__s.__current_) 294 { 295 case '\r': 296 case '\n': 297 case 0x2028: 298 case 0x2029: 299 __s.__do_ = __state::__reject; 300 __s.__node_ = nullptr; 301 break; 302 default: 303 __s.__do_ = __state::__accept_and_consume; 304 ++__s.__current_; 305 __s.__node_ = this->first(); 306 break; 307 } 308 } 309 else 310 { 311 __s.__do_ = __state::__reject; 312 __s.__node_ = nullptr; 313 } 314 } 315 316 _LIBCPP_END_NAMESPACE_STD 317