xref: /freebsd/contrib/llvm-project/libcxx/src/regex.cpp (revision 0b57cec536236d46e3dba9bd041533462f33dbb7)
//===-------------------------- regex.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8*0b57cec5SDimitry Andric 
9*0b57cec5SDimitry Andric #include "regex"
10*0b57cec5SDimitry Andric #include "algorithm"
11*0b57cec5SDimitry Andric #include "iterator"
12*0b57cec5SDimitry Andric 
13*0b57cec5SDimitry Andric _LIBCPP_BEGIN_NAMESPACE_STD
14*0b57cec5SDimitry Andric 
15*0b57cec5SDimitry Andric static
16*0b57cec5SDimitry Andric const char*
17*0b57cec5SDimitry Andric make_error_type_string(regex_constants::error_type ecode)
18*0b57cec5SDimitry Andric {
19*0b57cec5SDimitry Andric     switch (ecode)
20*0b57cec5SDimitry Andric     {
21*0b57cec5SDimitry Andric     case regex_constants::error_collate:
22*0b57cec5SDimitry Andric         return "The expression contained an invalid collating element name.";
23*0b57cec5SDimitry Andric     case regex_constants::error_ctype:
24*0b57cec5SDimitry Andric         return "The expression contained an invalid character class name.";
25*0b57cec5SDimitry Andric     case regex_constants::error_escape:
26*0b57cec5SDimitry Andric         return "The expression contained an invalid escaped character, or a "
27*0b57cec5SDimitry Andric                "trailing escape.";
28*0b57cec5SDimitry Andric     case regex_constants::error_backref:
29*0b57cec5SDimitry Andric         return "The expression contained an invalid back reference.";
30*0b57cec5SDimitry Andric     case regex_constants::error_brack:
31*0b57cec5SDimitry Andric         return "The expression contained mismatched [ and ].";
32*0b57cec5SDimitry Andric     case regex_constants::error_paren:
33*0b57cec5SDimitry Andric         return "The expression contained mismatched ( and ).";
34*0b57cec5SDimitry Andric     case regex_constants::error_brace:
35*0b57cec5SDimitry Andric         return "The expression contained mismatched { and }.";
36*0b57cec5SDimitry Andric     case regex_constants::error_badbrace:
37*0b57cec5SDimitry Andric         return "The expression contained an invalid range in a {} expression.";
38*0b57cec5SDimitry Andric     case regex_constants::error_range:
39*0b57cec5SDimitry Andric         return "The expression contained an invalid character range, "
40*0b57cec5SDimitry Andric                "such as [b-a] in most encodings.";
41*0b57cec5SDimitry Andric     case regex_constants::error_space:
42*0b57cec5SDimitry Andric         return "There was insufficient memory to convert the expression into "
43*0b57cec5SDimitry Andric                "a finite state machine.";
44*0b57cec5SDimitry Andric     case regex_constants::error_badrepeat:
45*0b57cec5SDimitry Andric         return "One of *?+{ was not preceded by a valid regular expression.";
46*0b57cec5SDimitry Andric     case regex_constants::error_complexity:
47*0b57cec5SDimitry Andric         return "The complexity of an attempted match against a regular "
48*0b57cec5SDimitry Andric                "expression exceeded a pre-set level.";
49*0b57cec5SDimitry Andric     case regex_constants::error_stack:
50*0b57cec5SDimitry Andric         return "There was insufficient memory to determine whether the regular "
51*0b57cec5SDimitry Andric                "expression could match the specified character sequence.";
52*0b57cec5SDimitry Andric     case regex_constants::__re_err_grammar:
53*0b57cec5SDimitry Andric         return "An invalid regex grammar has been requested.";
54*0b57cec5SDimitry Andric     case regex_constants::__re_err_empty:
55*0b57cec5SDimitry Andric         return "An empty regex is not allowed in the POSIX grammar.";
56*0b57cec5SDimitry Andric     default:
57*0b57cec5SDimitry Andric         break;
58*0b57cec5SDimitry Andric     }
59*0b57cec5SDimitry Andric     return "Unknown error type";
60*0b57cec5SDimitry Andric }
61*0b57cec5SDimitry Andric 
62*0b57cec5SDimitry Andric regex_error::regex_error(regex_constants::error_type ecode)
63*0b57cec5SDimitry Andric     : runtime_error(make_error_type_string(ecode)),
64*0b57cec5SDimitry Andric       __code_(ecode)
65*0b57cec5SDimitry Andric {}
66*0b57cec5SDimitry Andric 
67*0b57cec5SDimitry Andric regex_error::~regex_error() throw() {}
68*0b57cec5SDimitry Andric 
69*0b57cec5SDimitry Andric namespace {
70*0b57cec5SDimitry Andric 
// One row of the collating-element lookup table: a NUL-terminated element
// name paired with the single character it stands for.
struct collationnames
{
    const char* elem_;  // element name, used as the search key
    char char_;         // character the name maps to
};
76*0b57cec5SDimitry Andric 
77*0b57cec5SDimitry Andric const collationnames collatenames[] =
78*0b57cec5SDimitry Andric {
79*0b57cec5SDimitry Andric     {"A", 0x41},
80*0b57cec5SDimitry Andric     {"B", 0x42},
81*0b57cec5SDimitry Andric     {"C", 0x43},
82*0b57cec5SDimitry Andric     {"D", 0x44},
83*0b57cec5SDimitry Andric     {"E", 0x45},
84*0b57cec5SDimitry Andric     {"F", 0x46},
85*0b57cec5SDimitry Andric     {"G", 0x47},
86*0b57cec5SDimitry Andric     {"H", 0x48},
87*0b57cec5SDimitry Andric     {"I", 0x49},
88*0b57cec5SDimitry Andric     {"J", 0x4a},
89*0b57cec5SDimitry Andric     {"K", 0x4b},
90*0b57cec5SDimitry Andric     {"L", 0x4c},
91*0b57cec5SDimitry Andric     {"M", 0x4d},
92*0b57cec5SDimitry Andric     {"N", 0x4e},
93*0b57cec5SDimitry Andric     {"NUL", 0x00},
94*0b57cec5SDimitry Andric     {"O", 0x4f},
95*0b57cec5SDimitry Andric     {"P", 0x50},
96*0b57cec5SDimitry Andric     {"Q", 0x51},
97*0b57cec5SDimitry Andric     {"R", 0x52},
98*0b57cec5SDimitry Andric     {"S", 0x53},
99*0b57cec5SDimitry Andric     {"T", 0x54},
100*0b57cec5SDimitry Andric     {"U", 0x55},
101*0b57cec5SDimitry Andric     {"V", 0x56},
102*0b57cec5SDimitry Andric     {"W", 0x57},
103*0b57cec5SDimitry Andric     {"X", 0x58},
104*0b57cec5SDimitry Andric     {"Y", 0x59},
105*0b57cec5SDimitry Andric     {"Z", 0x5a},
106*0b57cec5SDimitry Andric     {"a", 0x61},
107*0b57cec5SDimitry Andric     {"alert", 0x07},
108*0b57cec5SDimitry Andric     {"ampersand", 0x26},
109*0b57cec5SDimitry Andric     {"apostrophe", 0x27},
110*0b57cec5SDimitry Andric     {"asterisk", 0x2a},
111*0b57cec5SDimitry Andric     {"b", 0x62},
112*0b57cec5SDimitry Andric     {"backslash", 0x5c},
113*0b57cec5SDimitry Andric     {"backspace", 0x08},
114*0b57cec5SDimitry Andric     {"c", 0x63},
115*0b57cec5SDimitry Andric     {"carriage-return", 0x0d},
116*0b57cec5SDimitry Andric     {"circumflex", 0x5e},
117*0b57cec5SDimitry Andric     {"circumflex-accent", 0x5e},
118*0b57cec5SDimitry Andric     {"colon", 0x3a},
119*0b57cec5SDimitry Andric     {"comma", 0x2c},
120*0b57cec5SDimitry Andric     {"commercial-at", 0x40},
121*0b57cec5SDimitry Andric     {"d", 0x64},
122*0b57cec5SDimitry Andric     {"dollar-sign", 0x24},
123*0b57cec5SDimitry Andric     {"e", 0x65},
124*0b57cec5SDimitry Andric     {"eight", 0x38},
125*0b57cec5SDimitry Andric     {"equals-sign", 0x3d},
126*0b57cec5SDimitry Andric     {"exclamation-mark", 0x21},
127*0b57cec5SDimitry Andric     {"f", 0x66},
128*0b57cec5SDimitry Andric     {"five", 0x35},
129*0b57cec5SDimitry Andric     {"form-feed", 0x0c},
130*0b57cec5SDimitry Andric     {"four", 0x34},
131*0b57cec5SDimitry Andric     {"full-stop", 0x2e},
132*0b57cec5SDimitry Andric     {"g", 0x67},
133*0b57cec5SDimitry Andric     {"grave-accent", 0x60},
134*0b57cec5SDimitry Andric     {"greater-than-sign", 0x3e},
135*0b57cec5SDimitry Andric     {"h", 0x68},
136*0b57cec5SDimitry Andric     {"hyphen", 0x2d},
137*0b57cec5SDimitry Andric     {"hyphen-minus", 0x2d},
138*0b57cec5SDimitry Andric     {"i", 0x69},
139*0b57cec5SDimitry Andric     {"j", 0x6a},
140*0b57cec5SDimitry Andric     {"k", 0x6b},
141*0b57cec5SDimitry Andric     {"l", 0x6c},
142*0b57cec5SDimitry Andric     {"left-brace", 0x7b},
143*0b57cec5SDimitry Andric     {"left-curly-bracket", 0x7b},
144*0b57cec5SDimitry Andric     {"left-parenthesis", 0x28},
145*0b57cec5SDimitry Andric     {"left-square-bracket", 0x5b},
146*0b57cec5SDimitry Andric     {"less-than-sign", 0x3c},
147*0b57cec5SDimitry Andric     {"low-line", 0x5f},
148*0b57cec5SDimitry Andric     {"m", 0x6d},
149*0b57cec5SDimitry Andric     {"n", 0x6e},
150*0b57cec5SDimitry Andric     {"newline", 0x0a},
151*0b57cec5SDimitry Andric     {"nine", 0x39},
152*0b57cec5SDimitry Andric     {"number-sign", 0x23},
153*0b57cec5SDimitry Andric     {"o", 0x6f},
154*0b57cec5SDimitry Andric     {"one", 0x31},
155*0b57cec5SDimitry Andric     {"p", 0x70},
156*0b57cec5SDimitry Andric     {"percent-sign", 0x25},
157*0b57cec5SDimitry Andric     {"period", 0x2e},
158*0b57cec5SDimitry Andric     {"plus-sign", 0x2b},
159*0b57cec5SDimitry Andric     {"q", 0x71},
160*0b57cec5SDimitry Andric     {"question-mark", 0x3f},
161*0b57cec5SDimitry Andric     {"quotation-mark", 0x22},
162*0b57cec5SDimitry Andric     {"r", 0x72},
163*0b57cec5SDimitry Andric     {"reverse-solidus", 0x5c},
164*0b57cec5SDimitry Andric     {"right-brace", 0x7d},
165*0b57cec5SDimitry Andric     {"right-curly-bracket", 0x7d},
166*0b57cec5SDimitry Andric     {"right-parenthesis", 0x29},
167*0b57cec5SDimitry Andric     {"right-square-bracket", 0x5d},
168*0b57cec5SDimitry Andric     {"s", 0x73},
169*0b57cec5SDimitry Andric     {"semicolon", 0x3b},
170*0b57cec5SDimitry Andric     {"seven", 0x37},
171*0b57cec5SDimitry Andric     {"six", 0x36},
172*0b57cec5SDimitry Andric     {"slash", 0x2f},
173*0b57cec5SDimitry Andric     {"solidus", 0x2f},
174*0b57cec5SDimitry Andric     {"space", 0x20},
175*0b57cec5SDimitry Andric     {"t", 0x74},
176*0b57cec5SDimitry Andric     {"tab", 0x09},
177*0b57cec5SDimitry Andric     {"three", 0x33},
178*0b57cec5SDimitry Andric     {"tilde", 0x7e},
179*0b57cec5SDimitry Andric     {"two", 0x32},
180*0b57cec5SDimitry Andric     {"u", 0x75},
181*0b57cec5SDimitry Andric     {"underscore", 0x5f},
182*0b57cec5SDimitry Andric     {"v", 0x76},
183*0b57cec5SDimitry Andric     {"vertical-line", 0x7c},
184*0b57cec5SDimitry Andric     {"vertical-tab", 0x0b},
185*0b57cec5SDimitry Andric     {"w", 0x77},
186*0b57cec5SDimitry Andric     {"x", 0x78},
187*0b57cec5SDimitry Andric     {"y", 0x79},
188*0b57cec5SDimitry Andric     {"z", 0x7a},
189*0b57cec5SDimitry Andric     {"zero", 0x30}
190*0b57cec5SDimitry Andric };
191*0b57cec5SDimitry Andric 
192*0b57cec5SDimitry Andric struct classnames
193*0b57cec5SDimitry Andric {
194*0b57cec5SDimitry Andric     const char* elem_;
195*0b57cec5SDimitry Andric     regex_traits<char>::char_class_type mask_;
196*0b57cec5SDimitry Andric };
197*0b57cec5SDimitry Andric 
198*0b57cec5SDimitry Andric const classnames ClassNames[] =
199*0b57cec5SDimitry Andric {
200*0b57cec5SDimitry Andric     {"alnum",  ctype_base::alnum},
201*0b57cec5SDimitry Andric     {"alpha",  ctype_base::alpha},
202*0b57cec5SDimitry Andric     {"blank",  ctype_base::blank},
203*0b57cec5SDimitry Andric     {"cntrl",  ctype_base::cntrl},
204*0b57cec5SDimitry Andric     {"d",      ctype_base::digit},
205*0b57cec5SDimitry Andric     {"digit",  ctype_base::digit},
206*0b57cec5SDimitry Andric     {"graph",  ctype_base::graph},
207*0b57cec5SDimitry Andric     {"lower",  ctype_base::lower},
208*0b57cec5SDimitry Andric     {"print",  ctype_base::print},
209*0b57cec5SDimitry Andric     {"punct",  ctype_base::punct},
210*0b57cec5SDimitry Andric     {"s",      ctype_base::space},
211*0b57cec5SDimitry Andric     {"space",  ctype_base::space},
212*0b57cec5SDimitry Andric     {"upper",  ctype_base::upper},
213*0b57cec5SDimitry Andric     {"w",      regex_traits<char>::__regex_word},
214*0b57cec5SDimitry Andric     {"xdigit", ctype_base::xdigit}
215*0b57cec5SDimitry Andric };
216*0b57cec5SDimitry Andric 
217*0b57cec5SDimitry Andric struct use_strcmp
218*0b57cec5SDimitry Andric {
219*0b57cec5SDimitry Andric     bool operator()(const collationnames& x, const char* y)
220*0b57cec5SDimitry Andric         {return strcmp(x.elem_, y) < 0;}
221*0b57cec5SDimitry Andric     bool operator()(const classnames& x, const char* y)
222*0b57cec5SDimitry Andric         {return strcmp(x.elem_, y) < 0;}
223*0b57cec5SDimitry Andric };
224*0b57cec5SDimitry Andric 
225*0b57cec5SDimitry Andric }
226*0b57cec5SDimitry Andric 
227*0b57cec5SDimitry Andric string
228*0b57cec5SDimitry Andric __get_collation_name(const char* s)
229*0b57cec5SDimitry Andric {
230*0b57cec5SDimitry Andric     const collationnames* i =
231*0b57cec5SDimitry Andric             _VSTD::lower_bound(begin(collatenames), end(collatenames), s, use_strcmp());
232*0b57cec5SDimitry Andric     string r;
233*0b57cec5SDimitry Andric     if (i != end(collatenames) && strcmp(s, i->elem_) == 0)
234*0b57cec5SDimitry Andric         r = char(i->char_);
235*0b57cec5SDimitry Andric     return r;
236*0b57cec5SDimitry Andric }
237*0b57cec5SDimitry Andric 
238*0b57cec5SDimitry Andric regex_traits<char>::char_class_type
239*0b57cec5SDimitry Andric __get_classname(const char* s, bool __icase)
240*0b57cec5SDimitry Andric {
241*0b57cec5SDimitry Andric     const classnames* i =
242*0b57cec5SDimitry Andric             _VSTD::lower_bound(begin(ClassNames), end(ClassNames), s, use_strcmp());
243*0b57cec5SDimitry Andric     regex_traits<char>::char_class_type r = 0;
244*0b57cec5SDimitry Andric     if (i != end(ClassNames) && strcmp(s, i->elem_) == 0)
245*0b57cec5SDimitry Andric     {
246*0b57cec5SDimitry Andric         r = i->mask_;
247*0b57cec5SDimitry Andric         if (r == regex_traits<char>::__regex_word)
248*0b57cec5SDimitry Andric             r |= ctype_base::alnum | ctype_base::upper | ctype_base::lower;
249*0b57cec5SDimitry Andric         else if (__icase)
250*0b57cec5SDimitry Andric         {
251*0b57cec5SDimitry Andric             if (r & (ctype_base::lower | ctype_base::upper))
252*0b57cec5SDimitry Andric                 r |= ctype_base::alpha;
253*0b57cec5SDimitry Andric         }
254*0b57cec5SDimitry Andric     }
255*0b57cec5SDimitry Andric     return r;
256*0b57cec5SDimitry Andric }
257*0b57cec5SDimitry Andric 
258*0b57cec5SDimitry Andric template <>
259*0b57cec5SDimitry Andric void
260*0b57cec5SDimitry Andric __match_any_but_newline<char>::__exec(__state& __s) const
261*0b57cec5SDimitry Andric {
262*0b57cec5SDimitry Andric     if (__s.__current_ != __s.__last_)
263*0b57cec5SDimitry Andric     {
264*0b57cec5SDimitry Andric         switch (*__s.__current_)
265*0b57cec5SDimitry Andric         {
266*0b57cec5SDimitry Andric         case '\r':
267*0b57cec5SDimitry Andric         case '\n':
268*0b57cec5SDimitry Andric             __s.__do_ = __state::__reject;
269*0b57cec5SDimitry Andric             __s.__node_ = nullptr;
270*0b57cec5SDimitry Andric             break;
271*0b57cec5SDimitry Andric         default:
272*0b57cec5SDimitry Andric             __s.__do_ = __state::__accept_and_consume;
273*0b57cec5SDimitry Andric             ++__s.__current_;
274*0b57cec5SDimitry Andric             __s.__node_ = this->first();
275*0b57cec5SDimitry Andric             break;
276*0b57cec5SDimitry Andric         }
277*0b57cec5SDimitry Andric     }
278*0b57cec5SDimitry Andric     else
279*0b57cec5SDimitry Andric     {
280*0b57cec5SDimitry Andric         __s.__do_ = __state::__reject;
281*0b57cec5SDimitry Andric         __s.__node_ = nullptr;
282*0b57cec5SDimitry Andric     }
283*0b57cec5SDimitry Andric }
284*0b57cec5SDimitry Andric 
285*0b57cec5SDimitry Andric template <>
286*0b57cec5SDimitry Andric void
287*0b57cec5SDimitry Andric __match_any_but_newline<wchar_t>::__exec(__state& __s) const
288*0b57cec5SDimitry Andric {
289*0b57cec5SDimitry Andric     if (__s.__current_ != __s.__last_)
290*0b57cec5SDimitry Andric     {
291*0b57cec5SDimitry Andric         switch (*__s.__current_)
292*0b57cec5SDimitry Andric         {
293*0b57cec5SDimitry Andric         case '\r':
294*0b57cec5SDimitry Andric         case '\n':
295*0b57cec5SDimitry Andric         case 0x2028:
296*0b57cec5SDimitry Andric         case 0x2029:
297*0b57cec5SDimitry Andric             __s.__do_ = __state::__reject;
298*0b57cec5SDimitry Andric             __s.__node_ = nullptr;
299*0b57cec5SDimitry Andric             break;
300*0b57cec5SDimitry Andric         default:
301*0b57cec5SDimitry Andric             __s.__do_ = __state::__accept_and_consume;
302*0b57cec5SDimitry Andric             ++__s.__current_;
303*0b57cec5SDimitry Andric             __s.__node_ = this->first();
304*0b57cec5SDimitry Andric             break;
305*0b57cec5SDimitry Andric         }
306*0b57cec5SDimitry Andric     }
307*0b57cec5SDimitry Andric     else
308*0b57cec5SDimitry Andric     {
309*0b57cec5SDimitry Andric         __s.__do_ = __state::__reject;
310*0b57cec5SDimitry Andric         __s.__node_ = nullptr;
311*0b57cec5SDimitry Andric     }
312*0b57cec5SDimitry Andric }
313*0b57cec5SDimitry Andric 
314*0b57cec5SDimitry Andric _LIBCPP_END_NAMESPACE_STD
315