xref: /freebsd/contrib/kyua/utils/text/regex.cpp (revision 18054d0220cfc8df9c9568c437bd6fbb59d53c3c)
1 // Copyright 2014 The Kyua Authors.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 //   notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 //   notice, this list of conditions and the following disclaimer in the
12 //   documentation and/or other materials provided with the distribution.
13 // * Neither the name of Google Inc. nor the names of its contributors
14 //   may be used to endorse or promote products derived from this software
15 //   without specific prior written permission.
16 //
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 
29 #include "utils/text/regex.hpp"
30 
31 extern "C" {
32 #include <sys/types.h>
33 
34 #include <regex.h>
35 }
36 
37 #include "utils/auto_array.ipp"
38 #include "utils/defs.hpp"
39 #include "utils/format/macros.hpp"
40 #include "utils/noncopyable.hpp"
41 #include "utils/sanity.hpp"
42 #include "utils/text/exceptions.hpp"
43 
44 namespace text = utils::text;
45 
46 
47 namespace {
48 
49 
50 static void throw_regex_error(const int, const ::regex_t*, const std::string&)
51     UTILS_NORETURN;
52 
53 
54 /// Constructs and raises a regex_error.
55 ///
56 /// \param error The error code returned by regcomp(3) or regexec(3).
57 /// \param preg The native regex object that caused this error.
58 /// \param prefix Error message prefix string.
59 ///
60 /// \throw regex_error The constructed exception.
61 static void
62 throw_regex_error(const int error, const ::regex_t* preg,
63                   const std::string& prefix)
64 {
65     char buffer[1024];
66 
67     // TODO(jmmv): Would be nice to handle the case where the message does
68     // not fit in the temporary buffer.
69     (void)::regerror(error, preg, buffer, sizeof(buffer));
70 
71     throw text::regex_error(F("%s: %s") % prefix % buffer);
72 }
73 
74 
75 }  // anonymous namespace
76 
77 
78 /// Internal implementation for regex_matches.
79 struct utils::text::regex_matches::impl : utils::noncopyable {
80     /// String on which we are matching.
81     ///
82     /// In theory, we could take a reference here instead of a copy, and make
83     /// it a requirement for the caller to ensure that the lifecycle of the
84     /// input string outlasts the lifecycle of the regex_matches.  However, that
85     /// contract is very easy to break with hardcoded strings (as we do in
86     /// tests).  Just go for the safer case here.
87     const std::string _string;
88 
89     /// Maximum number of matching groups we expect, including the full match.
90     ///
91     /// In other words, this is the size of the _matches array.
92     const std::size_t _nmatches;
93 
94     /// Native regular expression match representation.
95     utils::auto_array< ::regmatch_t > _matches;
96 
97     /// Constructor.
98     ///
99     /// This executes the regex on the given string and sets up the internal
100     /// class state based on the results.
101     ///
102     /// \param preg The native regex object.
103     /// \param str The string on which to execute the regex.
104     /// \param ngroups Number of capture groups in the regex.  This is an upper
105     ///     bound and may be greater than the actual matches.
106     ///
107     /// \throw regex_error If the call to regexec(3) fails.
108     impl(const ::regex_t* preg, const std::string& str,
109          const std::size_t ngroups) :
110         _string(str),
111         _nmatches(ngroups + 1),
112         _matches(new ::regmatch_t[_nmatches])
113     {
114         const int error = ::regexec(preg, _string.c_str(), _nmatches,
115                                     _matches.get(), 0);
116         if (error == REG_NOMATCH) {
117             _matches.reset(NULL);
118         } else if (error != 0) {
119             throw_regex_error(error, preg,
120                               F("regexec on '%s' failed") % _string);
121         }
122     }
123 
124     /// Destructor.
125     ~impl(void)
126     {
127     }
128 };
129 
130 
131 /// Constructor.
132 ///
133 /// \param pimpl Constructed implementation of the object.
134 text::regex_matches::regex_matches(std::shared_ptr< impl > pimpl) :
135     _pimpl(pimpl)
136 {
137 }
138 
139 
140 /// Destructor.
141 text::regex_matches::~regex_matches(void)
142 {
143 }
144 
145 
146 /// Returns the number of matches in this object.
147 ///
148 /// Note that this does not correspond to the number of groups provided at
149 /// construction time.  The returned value here accounts for only the returned
150 /// valid matches.
151 ///
152 /// \return Number of matches, including the full match.
153 std::size_t
154 text::regex_matches::count(void) const
155 {
156     std::size_t total = 0;
157     if (_pimpl->_matches.get() != NULL) {
158         for (std::size_t i = 0; i < _pimpl->_nmatches; ++i) {
159             if (_pimpl->_matches[i].rm_so != -1)
160                 ++total;
161         }
162         INV(total <= _pimpl->_nmatches);
163     }
164     return total;
165 }
166 
167 
168 /// Gets a match.
169 ///
170 /// \param index Number of the match to get.  Index 0 always contains the match
171 ///     of the whole regex.
172 ///
173 /// \pre There regex must have matched the input string.
174 /// \pre index must be lower than count().
175 ///
176 /// \return The textual match.
177 std::string
178 text::regex_matches::get(const std::size_t index) const
179 {
180     PRE(*this);
181     PRE(index < count());
182 
183     const ::regmatch_t* match = &_pimpl->_matches[index];
184 
185     return std::string(_pimpl->_string.c_str() + match->rm_so,
186                        match->rm_eo - match->rm_so);
187 }
188 
189 
190 /// Checks if there are any matches.
191 ///
192 /// \return True if the object contains one or more matches; false otherwise.
193 text::regex_matches::operator bool(void) const
194 {
195     return _pimpl->_matches.get() != NULL;
196 }
197 
198 
199 /// Internal implementation for regex.
200 struct utils::text::regex::impl : utils::noncopyable {
201     /// Native regular expression representation.
202     ::regex_t _preg;
203 
204     /// Number of capture groups in the regular expression.  This is an upper
205     /// bound and does NOT include the default full string match.
206     std::size_t _ngroups;
207 
208     /// Constructor.
209     ///
210     /// This compiles the given regular expression.
211     ///
212     /// \param regex_ The regular expression to compile.
213     /// \param ngroups Number of capture groups in the regular expression.  This
214     ///     is an upper bound and does NOT include the default full string
215     ///     match.
216     /// \param ignore_case Whether to ignore case during matching.
217     ///
218     /// \throw regex_error If the call to regcomp(3) fails.
219     impl(const std::string& regex_, const std::size_t ngroups,
220          const bool ignore_case) :
221         _ngroups(ngroups)
222     {
223         const int flags = REG_EXTENDED | (ignore_case ? REG_ICASE : 0);
224         const int error = ::regcomp(&_preg, regex_.c_str(), flags);
225         if (error != 0)
226             throw_regex_error(error, &_preg, F("regcomp on '%s' failed")
227                               % regex_);
228     }
229 
230     /// Destructor.
231     ~impl(void)
232     {
233         ::regfree(&_preg);
234     }
235 };
236 
237 
238 /// Constructor.
239 ///
240 /// \param pimpl Constructed implementation of the object.
241 text::regex::regex(std::shared_ptr< impl > pimpl) : _pimpl(pimpl)
242 {
243 }
244 
245 
246 /// Destructor.
247 text::regex::~regex(void)
248 {
249 }
250 
251 
252 /// Compiles a new regular expression.
253 ///
254 /// \param regex_ The regular expression to compile.
255 /// \param ngroups Number of capture groups in the regular expression.  This is
256 ///     an upper bound and does NOT include the default full string match.
257 /// \param ignore_case Whether to ignore case during matching.
258 ///
259 /// \return A new regular expression, ready to match strings.
260 ///
261 /// \throw regex_error If the regular expression is invalid and cannot be
262 ///     compiled.
263 text::regex
264 text::regex::compile(const std::string& regex_, const std::size_t ngroups,
265                      const bool ignore_case)
266 {
267     return regex(std::shared_ptr< impl >(new impl(regex_, ngroups,
268                                                   ignore_case)));
269 }
270 
271 
272 /// Matches the regular expression against a string.
273 ///
274 /// \param str String to match the regular expression against.
275 ///
276 /// \return A new regex_matches object with the results of the match.
277 text::regex_matches
278 text::regex::match(const std::string& str) const
279 {
280     std::shared_ptr< regex_matches::impl > pimpl(new regex_matches::impl(
281         &_pimpl->_preg, str, _pimpl->_ngroups));
282     return regex_matches(pimpl);
283 }
284 
285 
286 /// Compiles and matches a regular expression once.
287 ///
288 /// This is syntactic sugar to simplify the instantiation of a new regex object
289 /// and its subsequent match on a string.
290 ///
291 /// \param regex_ The regular expression to compile and match.
292 /// \param str String to match the regular expression against.
293 /// \param ngroups Number of capture groups in the regular expression.
294 /// \param ignore_case Whether to ignore case during matching.
295 ///
296 /// \return A new regex_matches object with the results of the match.
297 text::regex_matches
298 text::match_regex(const std::string& regex_, const std::string& str,
299                   const std::size_t ngroups, const bool ignore_case)
300 {
301     return regex::compile(regex_, ngroups, ignore_case).match(str);
302 }
303