1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a POSIX regular expression matcher. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/Regex.h" 14 #include "llvm/ADT/SmallVector.h" 15 #include "llvm/ADT/StringRef.h" 16 #include "llvm/ADT/Twine.h" 17 #include <cassert> 18 #include <string> 19 20 // Important this comes last because it defines "_REGEX_H_". At least on 21 // Darwin, if included before any header that (transitively) includes 22 // xlocale.h, this will cause trouble, because of missing regex-related types. 23 #include "regex_impl.h" 24 25 using namespace llvm; 26 27 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} 28 29 Regex::Regex(StringRef regex, RegexFlags Flags) { 30 unsigned flags = 0; 31 preg = new llvm_regex(); 32 preg->re_endp = regex.end(); 33 if (Flags & IgnoreCase) 34 flags |= REG_ICASE; 35 if (Flags & Newline) 36 flags |= REG_NEWLINE; 37 if (!(Flags & BasicRegex)) 38 flags |= REG_EXTENDED; 39 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 40 } 41 42 Regex::Regex(StringRef regex, unsigned Flags) 43 : Regex(regex, static_cast<RegexFlags>(Flags)) {} 44 45 Regex::Regex(Regex &®ex) { 46 preg = regex.preg; 47 error = regex.error; 48 regex.preg = nullptr; 49 regex.error = REG_BADPAT; 50 } 51 52 Regex::~Regex() { 53 if (preg) { 54 llvm_regfree(preg); 55 delete preg; 56 } 57 } 58 59 namespace { 60 61 /// Utility to convert a regex error code into a human-readable string. 62 void RegexErrorToString(int error, struct llvm_regex *preg, 63 std::string &Error) { 64 size_t len = llvm_regerror(error, preg, nullptr, 0); 65 66 Error.resize(len - 1); 67 llvm_regerror(error, preg, &Error[0], len); 68 } 69 70 } // namespace 71 72 bool Regex::isValid(std::string &Error) const { 73 if (!error) 74 return true; 75 76 RegexErrorToString(error, preg, Error); 77 return false; 78 } 79 80 /// getNumMatches - In a valid regex, return the number of parenthesized 81 /// matches it contains. 82 unsigned Regex::getNumMatches() const { 83 return preg->re_nsub; 84 } 85 86 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches, 87 std::string *Error) const { 88 // Reset error, if given. 89 if (Error && !Error->empty()) 90 *Error = ""; 91 92 // Check if the regex itself didn't successfully compile. 93 if (Error ? !isValid(*Error) : !isValid()) 94 return false; 95 96 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 97 98 // pmatch needs to have at least one element. 99 SmallVector<llvm_regmatch_t, 8> pm; 100 pm.resize(nmatch > 0 ? nmatch : 1); 101 pm[0].rm_so = 0; 102 pm[0].rm_eo = String.size(); 103 104 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 105 106 // Failure to match is not an error, it's just a normal return value. 107 // Any other error code is considered abnormal, and is logged in the Error. 108 if (rc == REG_NOMATCH) 109 return false; 110 if (rc != 0) { 111 if (Error) 112 RegexErrorToString(error, preg, *Error); 113 return false; 114 } 115 116 // There was a match. 117 118 if (Matches) { // match position requested 119 Matches->clear(); 120 121 for (unsigned i = 0; i != nmatch; ++i) { 122 if (pm[i].rm_so == -1) { 123 // this group didn't match 124 Matches->push_back(StringRef()); 125 continue; 126 } 127 assert(pm[i].rm_eo >= pm[i].rm_so); 128 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 129 pm[i].rm_eo-pm[i].rm_so)); 130 } 131 } 132 133 return true; 134 } 135 136 std::string Regex::sub(StringRef Repl, StringRef String, 137 std::string *Error) const { 138 SmallVector<StringRef, 8> Matches; 139 140 // Return the input if there was no match. 141 if (!match(String, &Matches, Error)) 142 return std::string(String); 143 144 // Otherwise splice in the replacement string, starting with the prefix before 145 // the match. 146 std::string Res(String.begin(), Matches[0].begin()); 147 148 // Then the replacement string, honoring possible substitutions. 149 while (!Repl.empty()) { 150 // Skip to the next escape. 151 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 152 153 // Add the skipped substring. 154 Res += Split.first; 155 156 // Check for terminimation and trailing backslash. 157 if (Split.second.empty()) { 158 if (Repl.size() != Split.first.size() && 159 Error && Error->empty()) 160 *Error = "replacement string contained trailing backslash"; 161 break; 162 } 163 164 // Otherwise update the replacement string and interpret escapes. 165 Repl = Split.second; 166 167 // FIXME: We should have a StringExtras function for mapping C99 escapes. 168 switch (Repl[0]) { 169 // Treat all unrecognized characters as self-quoting. 170 default: 171 Res += Repl[0]; 172 Repl = Repl.substr(1); 173 break; 174 175 // Single character escapes. 176 case 't': 177 Res += '\t'; 178 Repl = Repl.substr(1); 179 break; 180 case 'n': 181 Res += '\n'; 182 Repl = Repl.substr(1); 183 break; 184 185 // Decimal escapes are backreferences. 186 case '0': case '1': case '2': case '3': case '4': 187 case '5': case '6': case '7': case '8': case '9': { 188 // Extract the backreference number. 189 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 190 Repl = Repl.substr(Ref.size()); 191 192 unsigned RefValue; 193 if (!Ref.getAsInteger(10, RefValue) && 194 RefValue < Matches.size()) 195 Res += Matches[RefValue]; 196 else if (Error && Error->empty()) 197 *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); 198 break; 199 } 200 } 201 } 202 203 // And finally the suffix. 204 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 205 206 return Res; 207 } 208 209 // These are the special characters matched in functions like "p_ere_exp". 210 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 211 212 bool Regex::isLiteralERE(StringRef Str) { 213 // Check for regex metacharacters. This list was derived from our regex 214 // implementation in regcomp.c and double checked against the POSIX extended 215 // regular expression specification. 216 return Str.find_first_of(RegexMetachars) == StringRef::npos; 217 } 218 219 std::string Regex::escape(StringRef String) { 220 std::string RegexStr; 221 for (char C : String) { 222 if (strchr(RegexMetachars, C)) 223 RegexStr += '\\'; 224 RegexStr += C; 225 } 226 227 return RegexStr; 228 } 229