xref: /freebsd/contrib/llvm-project/libcxx/src/regex.cpp (revision 480093f4440d54b30b3025afeac24b48f2ba7a2e)
10b57cec5SDimitry Andric //===-------------------------- regex.cpp ---------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric 
90b57cec5SDimitry Andric #include "regex"
100b57cec5SDimitry Andric #include "algorithm"
110b57cec5SDimitry Andric #include "iterator"
120b57cec5SDimitry Andric 
130b57cec5SDimitry Andric _LIBCPP_BEGIN_NAMESPACE_STD
140b57cec5SDimitry Andric 
150b57cec5SDimitry Andric static
160b57cec5SDimitry Andric const char*
170b57cec5SDimitry Andric make_error_type_string(regex_constants::error_type ecode)
180b57cec5SDimitry Andric {
190b57cec5SDimitry Andric     switch (ecode)
200b57cec5SDimitry Andric     {
210b57cec5SDimitry Andric     case regex_constants::error_collate:
220b57cec5SDimitry Andric         return "The expression contained an invalid collating element name.";
230b57cec5SDimitry Andric     case regex_constants::error_ctype:
240b57cec5SDimitry Andric         return "The expression contained an invalid character class name.";
250b57cec5SDimitry Andric     case regex_constants::error_escape:
260b57cec5SDimitry Andric         return "The expression contained an invalid escaped character, or a "
270b57cec5SDimitry Andric                "trailing escape.";
280b57cec5SDimitry Andric     case regex_constants::error_backref:
290b57cec5SDimitry Andric         return "The expression contained an invalid back reference.";
300b57cec5SDimitry Andric     case regex_constants::error_brack:
310b57cec5SDimitry Andric         return "The expression contained mismatched [ and ].";
320b57cec5SDimitry Andric     case regex_constants::error_paren:
330b57cec5SDimitry Andric         return "The expression contained mismatched ( and ).";
340b57cec5SDimitry Andric     case regex_constants::error_brace:
350b57cec5SDimitry Andric         return "The expression contained mismatched { and }.";
360b57cec5SDimitry Andric     case regex_constants::error_badbrace:
370b57cec5SDimitry Andric         return "The expression contained an invalid range in a {} expression.";
380b57cec5SDimitry Andric     case regex_constants::error_range:
390b57cec5SDimitry Andric         return "The expression contained an invalid character range, "
400b57cec5SDimitry Andric                "such as [b-a] in most encodings.";
410b57cec5SDimitry Andric     case regex_constants::error_space:
420b57cec5SDimitry Andric         return "There was insufficient memory to convert the expression into "
430b57cec5SDimitry Andric                "a finite state machine.";
440b57cec5SDimitry Andric     case regex_constants::error_badrepeat:
450b57cec5SDimitry Andric         return "One of *?+{ was not preceded by a valid regular expression.";
460b57cec5SDimitry Andric     case regex_constants::error_complexity:
470b57cec5SDimitry Andric         return "The complexity of an attempted match against a regular "
480b57cec5SDimitry Andric                "expression exceeded a pre-set level.";
490b57cec5SDimitry Andric     case regex_constants::error_stack:
500b57cec5SDimitry Andric         return "There was insufficient memory to determine whether the regular "
510b57cec5SDimitry Andric                "expression could match the specified character sequence.";
520b57cec5SDimitry Andric     case regex_constants::__re_err_grammar:
530b57cec5SDimitry Andric         return "An invalid regex grammar has been requested.";
540b57cec5SDimitry Andric     case regex_constants::__re_err_empty:
550b57cec5SDimitry Andric         return "An empty regex is not allowed in the POSIX grammar.";
56*480093f4SDimitry Andric     case regex_constants::__re_err_parse:
57*480093f4SDimitry Andric         return "The parser did not consume the entire regular expression.";
580b57cec5SDimitry Andric     default:
590b57cec5SDimitry Andric         break;
600b57cec5SDimitry Andric     }
610b57cec5SDimitry Andric     return "Unknown error type";
620b57cec5SDimitry Andric }
630b57cec5SDimitry Andric 
640b57cec5SDimitry Andric regex_error::regex_error(regex_constants::error_type ecode)
650b57cec5SDimitry Andric     : runtime_error(make_error_type_string(ecode)),
660b57cec5SDimitry Andric       __code_(ecode)
670b57cec5SDimitry Andric {}
680b57cec5SDimitry Andric 
690b57cec5SDimitry Andric regex_error::~regex_error() throw() {}
700b57cec5SDimitry Andric 
710b57cec5SDimitry Andric namespace {
720b57cec5SDimitry Andric 
// One row of the collating-element lookup table: a symbolic name paired with
// the single character it stands for.
struct collationnames
{
    const char* elem_;  // NUL-terminated symbolic name; the lookup key
    char        char_;  // character value associated with elem_
};
780b57cec5SDimitry Andric 
790b57cec5SDimitry Andric const collationnames collatenames[] =
800b57cec5SDimitry Andric {
810b57cec5SDimitry Andric     {"A", 0x41},
820b57cec5SDimitry Andric     {"B", 0x42},
830b57cec5SDimitry Andric     {"C", 0x43},
840b57cec5SDimitry Andric     {"D", 0x44},
850b57cec5SDimitry Andric     {"E", 0x45},
860b57cec5SDimitry Andric     {"F", 0x46},
870b57cec5SDimitry Andric     {"G", 0x47},
880b57cec5SDimitry Andric     {"H", 0x48},
890b57cec5SDimitry Andric     {"I", 0x49},
900b57cec5SDimitry Andric     {"J", 0x4a},
910b57cec5SDimitry Andric     {"K", 0x4b},
920b57cec5SDimitry Andric     {"L", 0x4c},
930b57cec5SDimitry Andric     {"M", 0x4d},
940b57cec5SDimitry Andric     {"N", 0x4e},
950b57cec5SDimitry Andric     {"NUL", 0x00},
960b57cec5SDimitry Andric     {"O", 0x4f},
970b57cec5SDimitry Andric     {"P", 0x50},
980b57cec5SDimitry Andric     {"Q", 0x51},
990b57cec5SDimitry Andric     {"R", 0x52},
1000b57cec5SDimitry Andric     {"S", 0x53},
1010b57cec5SDimitry Andric     {"T", 0x54},
1020b57cec5SDimitry Andric     {"U", 0x55},
1030b57cec5SDimitry Andric     {"V", 0x56},
1040b57cec5SDimitry Andric     {"W", 0x57},
1050b57cec5SDimitry Andric     {"X", 0x58},
1060b57cec5SDimitry Andric     {"Y", 0x59},
1070b57cec5SDimitry Andric     {"Z", 0x5a},
1080b57cec5SDimitry Andric     {"a", 0x61},
1090b57cec5SDimitry Andric     {"alert", 0x07},
1100b57cec5SDimitry Andric     {"ampersand", 0x26},
1110b57cec5SDimitry Andric     {"apostrophe", 0x27},
1120b57cec5SDimitry Andric     {"asterisk", 0x2a},
1130b57cec5SDimitry Andric     {"b", 0x62},
1140b57cec5SDimitry Andric     {"backslash", 0x5c},
1150b57cec5SDimitry Andric     {"backspace", 0x08},
1160b57cec5SDimitry Andric     {"c", 0x63},
1170b57cec5SDimitry Andric     {"carriage-return", 0x0d},
1180b57cec5SDimitry Andric     {"circumflex", 0x5e},
1190b57cec5SDimitry Andric     {"circumflex-accent", 0x5e},
1200b57cec5SDimitry Andric     {"colon", 0x3a},
1210b57cec5SDimitry Andric     {"comma", 0x2c},
1220b57cec5SDimitry Andric     {"commercial-at", 0x40},
1230b57cec5SDimitry Andric     {"d", 0x64},
1240b57cec5SDimitry Andric     {"dollar-sign", 0x24},
1250b57cec5SDimitry Andric     {"e", 0x65},
1260b57cec5SDimitry Andric     {"eight", 0x38},
1270b57cec5SDimitry Andric     {"equals-sign", 0x3d},
1280b57cec5SDimitry Andric     {"exclamation-mark", 0x21},
1290b57cec5SDimitry Andric     {"f", 0x66},
1300b57cec5SDimitry Andric     {"five", 0x35},
1310b57cec5SDimitry Andric     {"form-feed", 0x0c},
1320b57cec5SDimitry Andric     {"four", 0x34},
1330b57cec5SDimitry Andric     {"full-stop", 0x2e},
1340b57cec5SDimitry Andric     {"g", 0x67},
1350b57cec5SDimitry Andric     {"grave-accent", 0x60},
1360b57cec5SDimitry Andric     {"greater-than-sign", 0x3e},
1370b57cec5SDimitry Andric     {"h", 0x68},
1380b57cec5SDimitry Andric     {"hyphen", 0x2d},
1390b57cec5SDimitry Andric     {"hyphen-minus", 0x2d},
1400b57cec5SDimitry Andric     {"i", 0x69},
1410b57cec5SDimitry Andric     {"j", 0x6a},
1420b57cec5SDimitry Andric     {"k", 0x6b},
1430b57cec5SDimitry Andric     {"l", 0x6c},
1440b57cec5SDimitry Andric     {"left-brace", 0x7b},
1450b57cec5SDimitry Andric     {"left-curly-bracket", 0x7b},
1460b57cec5SDimitry Andric     {"left-parenthesis", 0x28},
1470b57cec5SDimitry Andric     {"left-square-bracket", 0x5b},
1480b57cec5SDimitry Andric     {"less-than-sign", 0x3c},
1490b57cec5SDimitry Andric     {"low-line", 0x5f},
1500b57cec5SDimitry Andric     {"m", 0x6d},
1510b57cec5SDimitry Andric     {"n", 0x6e},
1520b57cec5SDimitry Andric     {"newline", 0x0a},
1530b57cec5SDimitry Andric     {"nine", 0x39},
1540b57cec5SDimitry Andric     {"number-sign", 0x23},
1550b57cec5SDimitry Andric     {"o", 0x6f},
1560b57cec5SDimitry Andric     {"one", 0x31},
1570b57cec5SDimitry Andric     {"p", 0x70},
1580b57cec5SDimitry Andric     {"percent-sign", 0x25},
1590b57cec5SDimitry Andric     {"period", 0x2e},
1600b57cec5SDimitry Andric     {"plus-sign", 0x2b},
1610b57cec5SDimitry Andric     {"q", 0x71},
1620b57cec5SDimitry Andric     {"question-mark", 0x3f},
1630b57cec5SDimitry Andric     {"quotation-mark", 0x22},
1640b57cec5SDimitry Andric     {"r", 0x72},
1650b57cec5SDimitry Andric     {"reverse-solidus", 0x5c},
1660b57cec5SDimitry Andric     {"right-brace", 0x7d},
1670b57cec5SDimitry Andric     {"right-curly-bracket", 0x7d},
1680b57cec5SDimitry Andric     {"right-parenthesis", 0x29},
1690b57cec5SDimitry Andric     {"right-square-bracket", 0x5d},
1700b57cec5SDimitry Andric     {"s", 0x73},
1710b57cec5SDimitry Andric     {"semicolon", 0x3b},
1720b57cec5SDimitry Andric     {"seven", 0x37},
1730b57cec5SDimitry Andric     {"six", 0x36},
1740b57cec5SDimitry Andric     {"slash", 0x2f},
1750b57cec5SDimitry Andric     {"solidus", 0x2f},
1760b57cec5SDimitry Andric     {"space", 0x20},
1770b57cec5SDimitry Andric     {"t", 0x74},
1780b57cec5SDimitry Andric     {"tab", 0x09},
1790b57cec5SDimitry Andric     {"three", 0x33},
1800b57cec5SDimitry Andric     {"tilde", 0x7e},
1810b57cec5SDimitry Andric     {"two", 0x32},
1820b57cec5SDimitry Andric     {"u", 0x75},
1830b57cec5SDimitry Andric     {"underscore", 0x5f},
1840b57cec5SDimitry Andric     {"v", 0x76},
1850b57cec5SDimitry Andric     {"vertical-line", 0x7c},
1860b57cec5SDimitry Andric     {"vertical-tab", 0x0b},
1870b57cec5SDimitry Andric     {"w", 0x77},
1880b57cec5SDimitry Andric     {"x", 0x78},
1890b57cec5SDimitry Andric     {"y", 0x79},
1900b57cec5SDimitry Andric     {"z", 0x7a},
1910b57cec5SDimitry Andric     {"zero", 0x30}
1920b57cec5SDimitry Andric };
1930b57cec5SDimitry Andric 
1940b57cec5SDimitry Andric struct classnames
1950b57cec5SDimitry Andric {
1960b57cec5SDimitry Andric     const char* elem_;
1970b57cec5SDimitry Andric     regex_traits<char>::char_class_type mask_;
1980b57cec5SDimitry Andric };
1990b57cec5SDimitry Andric 
2000b57cec5SDimitry Andric const classnames ClassNames[] =
2010b57cec5SDimitry Andric {
2020b57cec5SDimitry Andric     {"alnum",  ctype_base::alnum},
2030b57cec5SDimitry Andric     {"alpha",  ctype_base::alpha},
2040b57cec5SDimitry Andric     {"blank",  ctype_base::blank},
2050b57cec5SDimitry Andric     {"cntrl",  ctype_base::cntrl},
2060b57cec5SDimitry Andric     {"d",      ctype_base::digit},
2070b57cec5SDimitry Andric     {"digit",  ctype_base::digit},
2080b57cec5SDimitry Andric     {"graph",  ctype_base::graph},
2090b57cec5SDimitry Andric     {"lower",  ctype_base::lower},
2100b57cec5SDimitry Andric     {"print",  ctype_base::print},
2110b57cec5SDimitry Andric     {"punct",  ctype_base::punct},
2120b57cec5SDimitry Andric     {"s",      ctype_base::space},
2130b57cec5SDimitry Andric     {"space",  ctype_base::space},
2140b57cec5SDimitry Andric     {"upper",  ctype_base::upper},
2150b57cec5SDimitry Andric     {"w",      regex_traits<char>::__regex_word},
2160b57cec5SDimitry Andric     {"xdigit", ctype_base::xdigit}
2170b57cec5SDimitry Andric };
2180b57cec5SDimitry Andric 
2190b57cec5SDimitry Andric struct use_strcmp
2200b57cec5SDimitry Andric {
2210b57cec5SDimitry Andric     bool operator()(const collationnames& x, const char* y)
2220b57cec5SDimitry Andric         {return strcmp(x.elem_, y) < 0;}
2230b57cec5SDimitry Andric     bool operator()(const classnames& x, const char* y)
2240b57cec5SDimitry Andric         {return strcmp(x.elem_, y) < 0;}
2250b57cec5SDimitry Andric };
2260b57cec5SDimitry Andric 
2270b57cec5SDimitry Andric }
2280b57cec5SDimitry Andric 
2290b57cec5SDimitry Andric string
2300b57cec5SDimitry Andric __get_collation_name(const char* s)
2310b57cec5SDimitry Andric {
2320b57cec5SDimitry Andric     const collationnames* i =
2330b57cec5SDimitry Andric             _VSTD::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp());
2340b57cec5SDimitry Andric     string r;
2350b57cec5SDimitry Andric     if (i != end(collatenames) && strcmp(s, i->elem_) == 0)
2360b57cec5SDimitry Andric         r = char(i->char_);
2370b57cec5SDimitry Andric     return r;
2380b57cec5SDimitry Andric }
2390b57cec5SDimitry Andric 
2400b57cec5SDimitry Andric regex_traits<char>::char_class_type
2410b57cec5SDimitry Andric __get_classname(const char* s, bool __icase)
2420b57cec5SDimitry Andric {
2430b57cec5SDimitry Andric     const classnames* i =
2440b57cec5SDimitry Andric             _VSTD::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp());
2450b57cec5SDimitry Andric     regex_traits<char>::char_class_type r = 0;
2460b57cec5SDimitry Andric     if (i != end(ClassNames) && strcmp(s, i->elem_) == 0)
2470b57cec5SDimitry Andric     {
2480b57cec5SDimitry Andric         r = i->mask_;
2490b57cec5SDimitry Andric         if (r == regex_traits<char>::__regex_word)
2500b57cec5SDimitry Andric             r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower;
2510b57cec5SDimitry Andric         else if (__icase)
2520b57cec5SDimitry Andric         {
2530b57cec5SDimitry Andric             if (r & (ctype_base::lower | ctype_base::upper))
2540b57cec5SDimitry Andric                 r |= ctype_base::alpha;
2550b57cec5SDimitry Andric         }
2560b57cec5SDimitry Andric     }
2570b57cec5SDimitry Andric     return r;
2580b57cec5SDimitry Andric }
2590b57cec5SDimitry Andric 
2600b57cec5SDimitry Andric template <>
2610b57cec5SDimitry Andric void
2620b57cec5SDimitry Andric __match_any_but_newline<char>::__exec(__state& __s) const
2630b57cec5SDimitry Andric {
2640b57cec5SDimitry Andric     if (__s.__current_ != __s.__last_)
2650b57cec5SDimitry Andric     {
2660b57cec5SDimitry Andric         switch (*__s.__current_)
2670b57cec5SDimitry Andric         {
2680b57cec5SDimitry Andric         case '\r':
2690b57cec5SDimitry Andric         case '\n':
2700b57cec5SDimitry Andric             __s.__do_ = __state::__reject;
2710b57cec5SDimitry Andric             __s.__node_ = nullptr;
2720b57cec5SDimitry Andric             break;
2730b57cec5SDimitry Andric         default:
2740b57cec5SDimitry Andric             __s.__do_ = __state::__accept_and_consume;
2750b57cec5SDimitry Andric             ++__s.__current_;
2760b57cec5SDimitry Andric             __s.__node_ = this->first();
2770b57cec5SDimitry Andric             break;
2780b57cec5SDimitry Andric         }
2790b57cec5SDimitry Andric     }
2800b57cec5SDimitry Andric     else
2810b57cec5SDimitry Andric     {
2820b57cec5SDimitry Andric         __s.__do_ = __state::__reject;
2830b57cec5SDimitry Andric         __s.__node_ = nullptr;
2840b57cec5SDimitry Andric     }
2850b57cec5SDimitry Andric }
2860b57cec5SDimitry Andric 
2870b57cec5SDimitry Andric template <>
2880b57cec5SDimitry Andric void
2890b57cec5SDimitry Andric __match_any_but_newline<wchar_t>::__exec(__state& __s) const
2900b57cec5SDimitry Andric {
2910b57cec5SDimitry Andric     if (__s.__current_ != __s.__last_)
2920b57cec5SDimitry Andric     {
2930b57cec5SDimitry Andric         switch (*__s.__current_)
2940b57cec5SDimitry Andric         {
2950b57cec5SDimitry Andric         case '\r':
2960b57cec5SDimitry Andric         case '\n':
2970b57cec5SDimitry Andric         case 0x2028:
2980b57cec5SDimitry Andric         case 0x2029:
2990b57cec5SDimitry Andric             __s.__do_ = __state::__reject;
3000b57cec5SDimitry Andric             __s.__node_ = nullptr;
3010b57cec5SDimitry Andric             break;
3020b57cec5SDimitry Andric         default:
3030b57cec5SDimitry Andric             __s.__do_ = __state::__accept_and_consume;
3040b57cec5SDimitry Andric             ++__s.__current_;
3050b57cec5SDimitry Andric             __s.__node_ = this->first();
3060b57cec5SDimitry Andric             break;
3070b57cec5SDimitry Andric         }
3080b57cec5SDimitry Andric     }
3090b57cec5SDimitry Andric     else
3100b57cec5SDimitry Andric     {
3110b57cec5SDimitry Andric         __s.__do_ = __state::__reject;
3120b57cec5SDimitry Andric         __s.__node_ = nullptr;
3130b57cec5SDimitry Andric     }
3140b57cec5SDimitry Andric }
3150b57cec5SDimitry Andric 
3160b57cec5SDimitry Andric _LIBCPP_END_NAMESPACE_STD
317