// Copyright 2014 The Kyua Authors.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors
//   may be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 29 #include "utils/text/regex.hpp" 30 31 extern "C" { 32 #include <sys/types.h> 33 34 #include <regex.h> 35 } 36 37 #include "utils/auto_array.ipp" 38 #include "utils/defs.hpp" 39 #include "utils/format/macros.hpp" 40 #include "utils/noncopyable.hpp" 41 #include "utils/sanity.hpp" 42 #include "utils/text/exceptions.hpp" 43 44 namespace text = utils::text; 45 46 47 namespace { 48 49 50 static void throw_regex_error(const int, const ::regex_t*, const std::string&) 51 UTILS_NORETURN; 52 53 54 /// Constructs and raises a regex_error. 55 /// 56 /// \param error The error code returned by regcomp(3) or regexec(3). 57 /// \param preg The native regex object that caused this error. 58 /// \param prefix Error message prefix string. 59 /// 60 /// \throw regex_error The constructed exception. 61 static void 62 throw_regex_error(const int error, const ::regex_t* preg, 63 const std::string& prefix) 64 { 65 char buffer[1024]; 66 67 // TODO(jmmv): Would be nice to handle the case where the message does 68 // not fit in the temporary buffer. 69 (void)::regerror(error, preg, buffer, sizeof(buffer)); 70 71 throw text::regex_error(F("%s: %s") % prefix % buffer); 72 } 73 74 75 } // anonymous namespace 76 77 78 /// Internal implementation for regex_matches. 79 struct utils::text::regex_matches::impl : utils::noncopyable { 80 /// String on which we are matching. 81 /// 82 /// In theory, we could take a reference here instead of a copy, and make 83 /// it a requirement for the caller to ensure that the lifecycle of the 84 /// input string outlasts the lifecycle of the regex_matches. However, that 85 /// contract is very easy to break with hardcoded strings (as we do in 86 /// tests). Just go for the safer case here. 87 const std::string _string; 88 89 /// Maximum number of matching groups we expect, including the full match. 90 /// 91 /// In other words, this is the size of the _matches array. 92 const std::size_t _nmatches; 93 94 /// Native regular expression match representation. 
95 utils::auto_array< ::regmatch_t > _matches; 96 97 /// Constructor. 98 /// 99 /// This executes the regex on the given string and sets up the internal 100 /// class state based on the results. 101 /// 102 /// \param preg The native regex object. 103 /// \param str The string on which to execute the regex. 104 /// \param ngroups Number of capture groups in the regex. This is an upper 105 /// bound and may be greater than the actual matches. 106 /// 107 /// \throw regex_error If the call to regexec(3) fails. 108 impl(const ::regex_t* preg, const std::string& str, 109 const std::size_t ngroups) : 110 _string(str), 111 _nmatches(ngroups + 1), 112 _matches(new ::regmatch_t[_nmatches]) 113 { 114 const int error = ::regexec(preg, _string.c_str(), _nmatches, 115 _matches.get(), 0); 116 if (error == REG_NOMATCH) { 117 _matches.reset(NULL); 118 } else if (error != 0) { 119 throw_regex_error(error, preg, 120 F("regexec on '%s' failed") % _string); 121 } 122 } 123 124 /// Destructor. 125 ~impl(void) 126 { 127 } 128 }; 129 130 131 /// Constructor. 132 /// 133 /// \param pimpl Constructed implementation of the object. 134 text::regex_matches::regex_matches(std::shared_ptr< impl > pimpl) : 135 _pimpl(pimpl) 136 { 137 } 138 139 140 /// Destructor. 141 text::regex_matches::~regex_matches(void) 142 { 143 } 144 145 146 /// Returns the number of matches in this object. 147 /// 148 /// Note that this does not correspond to the number of groups provided at 149 /// construction time. The returned value here accounts for only the returned 150 /// valid matches. 151 /// 152 /// \return Number of matches, including the full match. 153 std::size_t 154 text::regex_matches::count(void) const 155 { 156 std::size_t total = 0; 157 if (_pimpl->_matches.get() != NULL) { 158 for (std::size_t i = 0; i < _pimpl->_nmatches; ++i) { 159 if (_pimpl->_matches[i].rm_so != -1) 160 ++total; 161 } 162 INV(total <= _pimpl->_nmatches); 163 } 164 return total; 165 } 166 167 168 /// Gets a match. 
169 /// 170 /// \param index Number of the match to get. Index 0 always contains the match 171 /// of the whole regex. 172 /// 173 /// \pre There regex must have matched the input string. 174 /// \pre index must be lower than count(). 175 /// 176 /// \return The textual match. 177 std::string 178 text::regex_matches::get(const std::size_t index) const 179 { 180 PRE(*this); 181 PRE(index < count()); 182 183 const ::regmatch_t* match = &_pimpl->_matches[index]; 184 185 return std::string(_pimpl->_string.c_str() + match->rm_so, 186 match->rm_eo - match->rm_so); 187 } 188 189 190 /// Checks if there are any matches. 191 /// 192 /// \return True if the object contains one or more matches; false otherwise. 193 text::regex_matches::operator bool(void) const 194 { 195 return _pimpl->_matches.get() != NULL; 196 } 197 198 199 /// Internal implementation for regex. 200 struct utils::text::regex::impl : utils::noncopyable { 201 /// Native regular expression representation. 202 ::regex_t _preg; 203 204 /// Number of capture groups in the regular expression. This is an upper 205 /// bound and does NOT include the default full string match. 206 std::size_t _ngroups; 207 208 /// Constructor. 209 /// 210 /// This compiles the given regular expression. 211 /// 212 /// \param regex_ The regular expression to compile. 213 /// \param ngroups Number of capture groups in the regular expression. This 214 /// is an upper bound and does NOT include the default full string 215 /// match. 216 /// \param ignore_case Whether to ignore case during matching. 217 /// 218 /// \throw regex_error If the call to regcomp(3) fails. 219 impl(const std::string& regex_, const std::size_t ngroups, 220 const bool ignore_case) : 221 _ngroups(ngroups) 222 { 223 const int flags = REG_EXTENDED | (ignore_case ? 
REG_ICASE : 0); 224 const int error = ::regcomp(&_preg, regex_.c_str(), flags); 225 if (error != 0) 226 throw_regex_error(error, &_preg, F("regcomp on '%s' failed") 227 % regex_); 228 } 229 230 /// Destructor. 231 ~impl(void) 232 { 233 ::regfree(&_preg); 234 } 235 }; 236 237 238 /// Constructor. 239 /// 240 /// \param pimpl Constructed implementation of the object. 241 text::regex::regex(std::shared_ptr< impl > pimpl) : _pimpl(pimpl) 242 { 243 } 244 245 246 /// Destructor. 247 text::regex::~regex(void) 248 { 249 } 250 251 252 /// Compiles a new regular expression. 253 /// 254 /// \param regex_ The regular expression to compile. 255 /// \param ngroups Number of capture groups in the regular expression. This is 256 /// an upper bound and does NOT include the default full string match. 257 /// \param ignore_case Whether to ignore case during matching. 258 /// 259 /// \return A new regular expression, ready to match strings. 260 /// 261 /// \throw regex_error If the regular expression is invalid and cannot be 262 /// compiled. 263 text::regex 264 text::regex::compile(const std::string& regex_, const std::size_t ngroups, 265 const bool ignore_case) 266 { 267 return regex(std::shared_ptr< impl >(new impl(regex_, ngroups, 268 ignore_case))); 269 } 270 271 272 /// Matches the regular expression against a string. 273 /// 274 /// \param str String to match the regular expression against. 275 /// 276 /// \return A new regex_matches object with the results of the match. 277 text::regex_matches 278 text::regex::match(const std::string& str) const 279 { 280 std::shared_ptr< regex_matches::impl > pimpl(new regex_matches::impl( 281 &_pimpl->_preg, str, _pimpl->_ngroups)); 282 return regex_matches(pimpl); 283 } 284 285 286 /// Compiles and matches a regular expression once. 287 /// 288 /// This is syntactic sugar to simplify the instantiation of a new regex object 289 /// and its subsequent match on a string. 
290 /// 291 /// \param regex_ The regular expression to compile and match. 292 /// \param str String to match the regular expression against. 293 /// \param ngroups Number of capture groups in the regular expression. 294 /// \param ignore_case Whether to ignore case during matching. 295 /// 296 /// \return A new regex_matches object with the results of the match. 297 text::regex_matches 298 text::match_regex(const std::string& regex_, const std::string& str, 299 const std::size_t ngroups, const bool ignore_case) 300 { 301 return regex::compile(regex_, ngroups, ignore_case).match(str); 302 } 303