1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a POSIX regular expression matcher. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/Regex.h" 14 #include "llvm/ADT/SmallVector.h" 15 #include "llvm/ADT/StringRef.h" 16 #include "llvm/ADT/Twine.h" 17 #include <string> 18 19 // Important this comes last because it defines "_REGEX_H_". At least on 20 // Darwin, if included before any header that (transitively) includes 21 // xlocale.h, this will cause trouble, because of missing regex-related types. 22 #include "regex_impl.h" 23 24 using namespace llvm; 25 26 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} 27 28 Regex::Regex(StringRef regex, unsigned Flags) { 29 unsigned flags = 0; 30 preg = new llvm_regex(); 31 preg->re_endp = regex.end(); 32 if (Flags & IgnoreCase) 33 flags |= REG_ICASE; 34 if (Flags & Newline) 35 flags |= REG_NEWLINE; 36 if (!(Flags & BasicRegex)) 37 flags |= REG_EXTENDED; 38 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 39 } 40 41 Regex::Regex(Regex &®ex) { 42 preg = regex.preg; 43 error = regex.error; 44 regex.preg = nullptr; 45 regex.error = REG_BADPAT; 46 } 47 48 Regex::~Regex() { 49 if (preg) { 50 llvm_regfree(preg); 51 delete preg; 52 } 53 } 54 55 namespace { 56 57 /// Utility to convert a regex error code into a human-readable string. 58 void RegexErrorToString(int error, struct llvm_regex *preg, 59 std::string &Error) { 60 size_t len = llvm_regerror(error, preg, nullptr, 0); 61 62 Error.resize(len - 1); 63 llvm_regerror(error, preg, &Error[0], len); 64 } 65 66 } // namespace 67 68 bool Regex::isValid(std::string &Error) const { 69 if (!error) 70 return true; 71 72 RegexErrorToString(error, preg, Error); 73 return false; 74 } 75 76 /// getNumMatches - In a valid regex, return the number of parenthesized 77 /// matches it contains. 78 unsigned Regex::getNumMatches() const { 79 return preg->re_nsub; 80 } 81 82 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches, 83 std::string *Error) const { 84 // Reset error, if given. 85 if (Error && !Error->empty()) 86 *Error = ""; 87 88 // Check if the regex itself didn't successfully compile. 89 if (Error ? !isValid(*Error) : !isValid()) 90 return false; 91 92 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 93 94 // pmatch needs to have at least one element. 95 SmallVector<llvm_regmatch_t, 8> pm; 96 pm.resize(nmatch > 0 ? nmatch : 1); 97 pm[0].rm_so = 0; 98 pm[0].rm_eo = String.size(); 99 100 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 101 102 // Failure to match is not an error, it's just a normal return value. 103 // Any other error code is considered abnormal, and is logged in the Error. 104 if (rc == REG_NOMATCH) 105 return false; 106 if (rc != 0) { 107 if (Error) 108 RegexErrorToString(error, preg, *Error); 109 return false; 110 } 111 112 // There was a match. 113 114 if (Matches) { // match position requested 115 Matches->clear(); 116 117 for (unsigned i = 0; i != nmatch; ++i) { 118 if (pm[i].rm_so == -1) { 119 // this group didn't match 120 Matches->push_back(StringRef()); 121 continue; 122 } 123 assert(pm[i].rm_eo >= pm[i].rm_so); 124 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 125 pm[i].rm_eo-pm[i].rm_so)); 126 } 127 } 128 129 return true; 130 } 131 132 std::string Regex::sub(StringRef Repl, StringRef String, 133 std::string *Error) const { 134 SmallVector<StringRef, 8> Matches; 135 136 // Return the input if there was no match. 137 if (!match(String, &Matches, Error)) 138 return String; 139 140 // Otherwise splice in the replacement string, starting with the prefix before 141 // the match. 142 std::string Res(String.begin(), Matches[0].begin()); 143 144 // Then the replacement string, honoring possible substitutions. 145 while (!Repl.empty()) { 146 // Skip to the next escape. 147 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 148 149 // Add the skipped substring. 150 Res += Split.first; 151 152 // Check for terminimation and trailing backslash. 153 if (Split.second.empty()) { 154 if (Repl.size() != Split.first.size() && 155 Error && Error->empty()) 156 *Error = "replacement string contained trailing backslash"; 157 break; 158 } 159 160 // Otherwise update the replacement string and interpret escapes. 161 Repl = Split.second; 162 163 // FIXME: We should have a StringExtras function for mapping C99 escapes. 164 switch (Repl[0]) { 165 // Treat all unrecognized characters as self-quoting. 166 default: 167 Res += Repl[0]; 168 Repl = Repl.substr(1); 169 break; 170 171 // Single character escapes. 172 case 't': 173 Res += '\t'; 174 Repl = Repl.substr(1); 175 break; 176 case 'n': 177 Res += '\n'; 178 Repl = Repl.substr(1); 179 break; 180 181 // Decimal escapes are backreferences. 182 case '0': case '1': case '2': case '3': case '4': 183 case '5': case '6': case '7': case '8': case '9': { 184 // Extract the backreference number. 185 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 186 Repl = Repl.substr(Ref.size()); 187 188 unsigned RefValue; 189 if (!Ref.getAsInteger(10, RefValue) && 190 RefValue < Matches.size()) 191 Res += Matches[RefValue]; 192 else if (Error && Error->empty()) 193 *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); 194 break; 195 } 196 } 197 } 198 199 // And finally the suffix. 200 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 201 202 return Res; 203 } 204 205 // These are the special characters matched in functions like "p_ere_exp". 206 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 207 208 bool Regex::isLiteralERE(StringRef Str) { 209 // Check for regex metacharacters. This list was derived from our regex 210 // implementation in regcomp.c and double checked against the POSIX extended 211 // regular expression specification. 212 return Str.find_first_of(RegexMetachars) == StringRef::npos; 213 } 214 215 std::string Regex::escape(StringRef String) { 216 std::string RegexStr; 217 for (unsigned i = 0, e = String.size(); i != e; ++i) { 218 if (strchr(RegexMetachars, String[i])) 219 RegexStr += '\\'; 220 RegexStr += String[i]; 221 } 222 223 return RegexStr; 224 } 225