1 // Copyright 2014 The Kyua Authors.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of Google Inc. nor the names of its contributors
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 //
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29 #include "utils/text/regex.hpp"
30
31 extern "C" {
32 #include <sys/types.h>
33
34 #include <regex.h>
35 }
36
37 #include "utils/auto_array.ipp"
38 #include "utils/defs.hpp"
39 #include "utils/format/macros.hpp"
40 #include "utils/noncopyable.hpp"
41 #include "utils/sanity.hpp"
42 #include "utils/text/exceptions.hpp"
43
44 namespace text = utils::text;
45
46
47 namespace {
48
49
50 static void throw_regex_error(const int, const ::regex_t*, const std::string&)
51 UTILS_NORETURN;
52
53
54 /// Constructs and raises a regex_error.
55 ///
56 /// \param error The error code returned by regcomp(3) or regexec(3).
57 /// \param preg The native regex object that caused this error.
58 /// \param prefix Error message prefix string.
59 ///
60 /// \throw regex_error The constructed exception.
61 static void
throw_regex_error(const int error,const::regex_t * preg,const std::string & prefix)62 throw_regex_error(const int error, const ::regex_t* preg,
63 const std::string& prefix)
64 {
65 char buffer[1024];
66
67 // TODO(jmmv): Would be nice to handle the case where the message does
68 // not fit in the temporary buffer.
69 (void)::regerror(error, preg, buffer, sizeof(buffer));
70
71 throw text::regex_error(F("%s: %s") % prefix % buffer);
72 }
73
74
75 } // anonymous namespace
76
77
78 /// Internal implementation for regex_matches.
79 struct utils::text::regex_matches::impl : utils::noncopyable {
80 /// String on which we are matching.
81 ///
82 /// In theory, we could take a reference here instead of a copy, and make
83 /// it a requirement for the caller to ensure that the lifecycle of the
84 /// input string outlasts the lifecycle of the regex_matches. However, that
85 /// contract is very easy to break with hardcoded strings (as we do in
86 /// tests). Just go for the safer case here.
87 const std::string _string;
88
89 /// Maximum number of matching groups we expect, including the full match.
90 ///
91 /// In other words, this is the size of the _matches array.
92 const std::size_t _nmatches;
93
94 /// Native regular expression match representation.
95 utils::auto_array< ::regmatch_t > _matches;
96
97 /// Constructor.
98 ///
99 /// This executes the regex on the given string and sets up the internal
100 /// class state based on the results.
101 ///
102 /// \param preg The native regex object.
103 /// \param str The string on which to execute the regex.
104 /// \param ngroups Number of capture groups in the regex. This is an upper
105 /// bound and may be greater than the actual matches.
106 ///
107 /// \throw regex_error If the call to regexec(3) fails.
implutils::text::regex_matches::impl108 impl(const ::regex_t* preg, const std::string& str,
109 const std::size_t ngroups) :
110 _string(str),
111 _nmatches(ngroups + 1),
112 _matches(new ::regmatch_t[_nmatches])
113 {
114 const int error = ::regexec(preg, _string.c_str(), _nmatches,
115 _matches.get(), 0);
116 if (error == REG_NOMATCH) {
117 _matches.reset(NULL);
118 } else if (error != 0) {
119 throw_regex_error(error, preg,
120 F("regexec on '%s' failed") % _string);
121 }
122 }
123
124 /// Destructor.
~implutils::text::regex_matches::impl125 ~impl(void)
126 {
127 }
128 };
129
130
131 /// Constructor.
132 ///
133 /// \param pimpl Constructed implementation of the object.
regex_matches(std::shared_ptr<impl> pimpl)134 text::regex_matches::regex_matches(std::shared_ptr< impl > pimpl) :
135 _pimpl(pimpl)
136 {
137 }
138
139
140 /// Destructor.
~regex_matches(void)141 text::regex_matches::~regex_matches(void)
142 {
143 }
144
145
146 /// Returns the number of matches in this object.
147 ///
148 /// Note that this does not correspond to the number of groups provided at
149 /// construction time. The returned value here accounts for only the returned
150 /// valid matches.
151 ///
152 /// \return Number of matches, including the full match.
153 std::size_t
count(void) const154 text::regex_matches::count(void) const
155 {
156 std::size_t total = 0;
157 if (_pimpl->_matches.get() != NULL) {
158 for (std::size_t i = 0; i < _pimpl->_nmatches; ++i) {
159 if (_pimpl->_matches[i].rm_so != -1)
160 ++total;
161 }
162 INV(total <= _pimpl->_nmatches);
163 }
164 return total;
165 }
166
167
168 /// Gets a match.
169 ///
170 /// \param index Number of the match to get. Index 0 always contains the match
171 /// of the whole regex.
172 ///
173 /// \pre There regex must have matched the input string.
174 /// \pre index must be lower than count().
175 ///
176 /// \return The textual match.
177 std::string
get(const std::size_t index) const178 text::regex_matches::get(const std::size_t index) const
179 {
180 PRE(*this);
181 PRE(index < count());
182
183 const ::regmatch_t* match = &_pimpl->_matches[index];
184
185 return std::string(_pimpl->_string.c_str() + match->rm_so,
186 match->rm_eo - match->rm_so);
187 }
188
189
190 /// Checks if there are any matches.
191 ///
192 /// \return True if the object contains one or more matches; false otherwise.
operator bool(void) const193 text::regex_matches::operator bool(void) const
194 {
195 return _pimpl->_matches.get() != NULL;
196 }
197
198
199 /// Internal implementation for regex.
200 struct utils::text::regex::impl : utils::noncopyable {
201 /// Native regular expression representation.
202 ::regex_t _preg;
203
204 /// Number of capture groups in the regular expression. This is an upper
205 /// bound and does NOT include the default full string match.
206 std::size_t _ngroups;
207
208 /// Constructor.
209 ///
210 /// This compiles the given regular expression.
211 ///
212 /// \param regex_ The regular expression to compile.
213 /// \param ngroups Number of capture groups in the regular expression. This
214 /// is an upper bound and does NOT include the default full string
215 /// match.
216 /// \param ignore_case Whether to ignore case during matching.
217 ///
218 /// \throw regex_error If the call to regcomp(3) fails.
implutils::text::regex::impl219 impl(const std::string& regex_, const std::size_t ngroups,
220 const bool ignore_case) :
221 _ngroups(ngroups)
222 {
223 const int flags = REG_EXTENDED | (ignore_case ? REG_ICASE : 0);
224 const int error = ::regcomp(&_preg, regex_.c_str(), flags);
225 if (error != 0)
226 throw_regex_error(error, &_preg, F("regcomp on '%s' failed")
227 % regex_);
228 }
229
230 /// Destructor.
~implutils::text::regex::impl231 ~impl(void)
232 {
233 ::regfree(&_preg);
234 }
235 };
236
237
238 /// Constructor.
239 ///
240 /// \param pimpl Constructed implementation of the object.
regex(std::shared_ptr<impl> pimpl)241 text::regex::regex(std::shared_ptr< impl > pimpl) : _pimpl(pimpl)
242 {
243 }
244
245
246 /// Destructor.
~regex(void)247 text::regex::~regex(void)
248 {
249 }
250
251
252 /// Compiles a new regular expression.
253 ///
254 /// \param regex_ The regular expression to compile.
255 /// \param ngroups Number of capture groups in the regular expression. This is
256 /// an upper bound and does NOT include the default full string match.
257 /// \param ignore_case Whether to ignore case during matching.
258 ///
259 /// \return A new regular expression, ready to match strings.
260 ///
261 /// \throw regex_error If the regular expression is invalid and cannot be
262 /// compiled.
263 text::regex
compile(const std::string & regex_,const std::size_t ngroups,const bool ignore_case)264 text::regex::compile(const std::string& regex_, const std::size_t ngroups,
265 const bool ignore_case)
266 {
267 return regex(std::shared_ptr< impl >(new impl(regex_, ngroups,
268 ignore_case)));
269 }
270
271
272 /// Matches the regular expression against a string.
273 ///
274 /// \param str String to match the regular expression against.
275 ///
276 /// \return A new regex_matches object with the results of the match.
277 text::regex_matches
match(const std::string & str) const278 text::regex::match(const std::string& str) const
279 {
280 std::shared_ptr< regex_matches::impl > pimpl(new regex_matches::impl(
281 &_pimpl->_preg, str, _pimpl->_ngroups));
282 return regex_matches(pimpl);
283 }
284
285
286 /// Compiles and matches a regular expression once.
287 ///
288 /// This is syntactic sugar to simplify the instantiation of a new regex object
289 /// and its subsequent match on a string.
290 ///
291 /// \param regex_ The regular expression to compile and match.
292 /// \param str String to match the regular expression against.
293 /// \param ngroups Number of capture groups in the regular expression.
294 /// \param ignore_case Whether to ignore case during matching.
295 ///
296 /// \return A new regex_matches object with the results of the match.
297 text::regex_matches
match_regex(const std::string & regex_,const std::string & str,const std::size_t ngroups,const bool ignore_case)298 text::match_regex(const std::string& regex_, const std::string& str,
299 const std::size_t ngroups, const bool ignore_case)
300 {
301 return regex::compile(regex_, ngroups, ignore_case).match(str);
302 }
303