1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a POSIX regular expression matcher. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/Regex.h" 14 #include "llvm/ADT/SmallVector.h" 15 #include "llvm/ADT/StringRef.h" 16 #include "llvm/ADT/Twine.h" 17 #include "regex_impl.h" 18 19 #include <cassert> 20 #include <string> 21 22 using namespace llvm; 23 24 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} 25 26 Regex::Regex(StringRef regex, RegexFlags Flags) { 27 unsigned flags = 0; 28 preg = new llvm_regex(); 29 preg->re_endp = regex.end(); 30 if (Flags & IgnoreCase) 31 flags |= REG_ICASE; 32 if (Flags & Newline) 33 flags |= REG_NEWLINE; 34 if (!(Flags & BasicRegex)) 35 flags |= REG_EXTENDED; 36 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 37 } 38 39 Regex::Regex(StringRef regex, unsigned Flags) 40 : Regex(regex, static_cast<RegexFlags>(Flags)) {} 41 42 Regex::Regex(Regex &®ex) { 43 preg = regex.preg; 44 error = regex.error; 45 regex.preg = nullptr; 46 regex.error = REG_BADPAT; 47 } 48 49 Regex::~Regex() { 50 if (preg) { 51 llvm_regfree(preg); 52 delete preg; 53 } 54 } 55 56 namespace { 57 58 /// Utility to convert a regex error code into a human-readable string. 59 void RegexErrorToString(int error, struct llvm_regex *preg, 60 std::string &Error) { 61 size_t len = llvm_regerror(error, preg, nullptr, 0); 62 63 Error.resize(len - 1); 64 llvm_regerror(error, preg, &Error[0], len); 65 } 66 67 } // namespace 68 69 bool Regex::isValid(std::string &Error) const { 70 if (!error) 71 return true; 72 73 RegexErrorToString(error, preg, Error); 74 return false; 75 } 76 77 /// getNumMatches - In a valid regex, return the number of parenthesized 78 /// matches it contains. 79 unsigned Regex::getNumMatches() const { 80 return preg->re_nsub; 81 } 82 83 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches, 84 std::string *Error) const { 85 // Reset error, if given. 86 if (Error && !Error->empty()) 87 *Error = ""; 88 89 // Check if the regex itself didn't successfully compile. 90 if (Error ? !isValid(*Error) : !isValid()) 91 return false; 92 93 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 94 95 // Update null string to empty string. 96 if (String.data() == nullptr) 97 String = ""; 98 99 // pmatch needs to have at least one element. 100 SmallVector<llvm_regmatch_t, 8> pm; 101 pm.resize(nmatch > 0 ? nmatch : 1); 102 pm[0].rm_so = 0; 103 pm[0].rm_eo = String.size(); 104 105 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 106 107 // Failure to match is not an error, it's just a normal return value. 108 // Any other error code is considered abnormal, and is logged in the Error. 109 if (rc == REG_NOMATCH) 110 return false; 111 if (rc != 0) { 112 if (Error) 113 RegexErrorToString(error, preg, *Error); 114 return false; 115 } 116 117 // There was a match. 118 119 if (Matches) { // match position requested 120 Matches->clear(); 121 122 for (unsigned i = 0; i != nmatch; ++i) { 123 if (pm[i].rm_so == -1) { 124 // this group didn't match 125 Matches->push_back(StringRef()); 126 continue; 127 } 128 assert(pm[i].rm_eo >= pm[i].rm_so); 129 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 130 pm[i].rm_eo-pm[i].rm_so)); 131 } 132 } 133 134 return true; 135 } 136 137 std::string Regex::sub(StringRef Repl, StringRef String, 138 std::string *Error) const { 139 SmallVector<StringRef, 8> Matches; 140 141 // Return the input if there was no match. 142 if (!match(String, &Matches, Error)) 143 return std::string(String); 144 145 // Otherwise splice in the replacement string, starting with the prefix before 146 // the match. 147 std::string Res(String.begin(), Matches[0].begin()); 148 149 // Then the replacement string, honoring possible substitutions. 150 while (!Repl.empty()) { 151 // Skip to the next escape. 152 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 153 154 // Add the skipped substring. 155 Res += Split.first; 156 157 // Check for terminimation and trailing backslash. 158 if (Split.second.empty()) { 159 if (Repl.size() != Split.first.size() && 160 Error && Error->empty()) 161 *Error = "replacement string contained trailing backslash"; 162 break; 163 } 164 165 // Otherwise update the replacement string and interpret escapes. 166 Repl = Split.second; 167 168 // FIXME: We should have a StringExtras function for mapping C99 escapes. 169 switch (Repl[0]) { 170 171 // Backreference with the "\g<ref>" syntax 172 case 'g': 173 if (Repl.size() >= 4 && Repl[1] == '<') { 174 size_t End = Repl.find('>'); 175 StringRef Ref = Repl.slice(2, End); 176 unsigned RefValue; 177 if (End != StringRef::npos && !Ref.getAsInteger(10, RefValue)) { 178 Repl = Repl.substr(End + 1); 179 if (RefValue < Matches.size()) 180 Res += Matches[RefValue]; 181 else if (Error && Error->empty()) 182 *Error = 183 ("invalid backreference string 'g<" + Twine(Ref) + ">'").str(); 184 break; 185 } 186 } 187 [[fallthrough]]; 188 189 // Treat all unrecognized characters as self-quoting. 190 default: 191 Res += Repl[0]; 192 Repl = Repl.substr(1); 193 break; 194 195 // Single character escapes. 196 case 't': 197 Res += '\t'; 198 Repl = Repl.substr(1); 199 break; 200 case 'n': 201 Res += '\n'; 202 Repl = Repl.substr(1); 203 break; 204 205 // Decimal escapes are backreferences. 206 case '0': case '1': case '2': case '3': case '4': 207 case '5': case '6': case '7': case '8': case '9': { 208 // Extract the backreference number. 209 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 210 Repl = Repl.substr(Ref.size()); 211 212 unsigned RefValue; 213 if (!Ref.getAsInteger(10, RefValue) && 214 RefValue < Matches.size()) 215 Res += Matches[RefValue]; 216 else if (Error && Error->empty()) 217 *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); 218 break; 219 } 220 } 221 } 222 223 // And finally the suffix. 224 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 225 226 return Res; 227 } 228 229 // These are the special characters matched in functions like "p_ere_exp". 230 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 231 232 bool Regex::isLiteralERE(StringRef Str) { 233 // Check for regex metacharacters. This list was derived from our regex 234 // implementation in regcomp.c and double checked against the POSIX extended 235 // regular expression specification. 236 return Str.find_first_of(RegexMetachars) == StringRef::npos; 237 } 238 239 std::string Regex::escape(StringRef String) { 240 std::string RegexStr; 241 for (char C : String) { 242 if (strchr(RegexMetachars, C)) 243 RegexStr += '\\'; 244 RegexStr += C; 245 } 246 247 return RegexStr; 248 } 249