1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a POSIX regular expression matcher. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/Regex.h" 14 #include "llvm/ADT/SmallVector.h" 15 #include "llvm/ADT/StringRef.h" 16 #include "llvm/ADT/Twine.h" 17 #include "regex_impl.h" 18 19 #include <cassert> 20 #include <string> 21 22 using namespace llvm; 23 24 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} 25 26 Regex::Regex(StringRef regex, RegexFlags Flags) { 27 unsigned flags = 0; 28 preg = new llvm_regex(); 29 preg->re_endp = regex.end(); 30 if (Flags & IgnoreCase) 31 flags |= REG_ICASE; 32 if (Flags & Newline) 33 flags |= REG_NEWLINE; 34 if (!(Flags & BasicRegex)) 35 flags |= REG_EXTENDED; 36 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 37 } 38 39 Regex::Regex(StringRef regex, unsigned Flags) 40 : Regex(regex, static_cast<RegexFlags>(Flags)) {} 41 42 Regex::Regex(Regex &®ex) { 43 preg = regex.preg; 44 error = regex.error; 45 regex.preg = nullptr; 46 regex.error = REG_BADPAT; 47 } 48 49 Regex::~Regex() { 50 if (preg) { 51 llvm_regfree(preg); 52 delete preg; 53 } 54 } 55 56 namespace { 57 58 /// Utility to convert a regex error code into a human-readable string. 59 void RegexErrorToString(int error, struct llvm_regex *preg, 60 std::string &Error) { 61 size_t len = llvm_regerror(error, preg, nullptr, 0); 62 63 Error.resize(len - 1); 64 llvm_regerror(error, preg, &Error[0], len); 65 } 66 67 } // namespace 68 69 bool Regex::isValid(std::string &Error) const { 70 if (!error) 71 return true; 72 73 RegexErrorToString(error, preg, Error); 74 return false; 75 } 76 77 /// getNumMatches - In a valid regex, return the number of parenthesized 78 /// matches it contains. 79 unsigned Regex::getNumMatches() const { 80 return preg->re_nsub; 81 } 82 83 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches, 84 std::string *Error) const { 85 // Reset error, if given. 86 if (Error && !Error->empty()) 87 *Error = ""; 88 89 // Check if the regex itself didn't successfully compile. 90 if (Error ? !isValid(*Error) : !isValid()) 91 return false; 92 93 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 94 95 // pmatch needs to have at least one element. 96 SmallVector<llvm_regmatch_t, 8> pm; 97 pm.resize(nmatch > 0 ? nmatch : 1); 98 pm[0].rm_so = 0; 99 pm[0].rm_eo = String.size(); 100 101 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 102 103 // Failure to match is not an error, it's just a normal return value. 104 // Any other error code is considered abnormal, and is logged in the Error. 105 if (rc == REG_NOMATCH) 106 return false; 107 if (rc != 0) { 108 if (Error) 109 RegexErrorToString(error, preg, *Error); 110 return false; 111 } 112 113 // There was a match. 114 115 if (Matches) { // match position requested 116 Matches->clear(); 117 118 for (unsigned i = 0; i != nmatch; ++i) { 119 if (pm[i].rm_so == -1) { 120 // this group didn't match 121 Matches->push_back(StringRef()); 122 continue; 123 } 124 assert(pm[i].rm_eo >= pm[i].rm_so); 125 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 126 pm[i].rm_eo-pm[i].rm_so)); 127 } 128 } 129 130 return true; 131 } 132 133 std::string Regex::sub(StringRef Repl, StringRef String, 134 std::string *Error) const { 135 SmallVector<StringRef, 8> Matches; 136 137 // Return the input if there was no match. 138 if (!match(String, &Matches, Error)) 139 return std::string(String); 140 141 // Otherwise splice in the replacement string, starting with the prefix before 142 // the match. 143 std::string Res(String.begin(), Matches[0].begin()); 144 145 // Then the replacement string, honoring possible substitutions. 146 while (!Repl.empty()) { 147 // Skip to the next escape. 148 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 149 150 // Add the skipped substring. 151 Res += Split.first; 152 153 // Check for terminimation and trailing backslash. 154 if (Split.second.empty()) { 155 if (Repl.size() != Split.first.size() && 156 Error && Error->empty()) 157 *Error = "replacement string contained trailing backslash"; 158 break; 159 } 160 161 // Otherwise update the replacement string and interpret escapes. 162 Repl = Split.second; 163 164 // FIXME: We should have a StringExtras function for mapping C99 escapes. 165 switch (Repl[0]) { 166 // Treat all unrecognized characters as self-quoting. 167 default: 168 Res += Repl[0]; 169 Repl = Repl.substr(1); 170 break; 171 172 // Single character escapes. 173 case 't': 174 Res += '\t'; 175 Repl = Repl.substr(1); 176 break; 177 case 'n': 178 Res += '\n'; 179 Repl = Repl.substr(1); 180 break; 181 182 // Decimal escapes are backreferences. 183 case '0': case '1': case '2': case '3': case '4': 184 case '5': case '6': case '7': case '8': case '9': { 185 // Extract the backreference number. 186 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 187 Repl = Repl.substr(Ref.size()); 188 189 unsigned RefValue; 190 if (!Ref.getAsInteger(10, RefValue) && 191 RefValue < Matches.size()) 192 Res += Matches[RefValue]; 193 else if (Error && Error->empty()) 194 *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); 195 break; 196 } 197 } 198 } 199 200 // And finally the suffix. 201 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 202 203 return Res; 204 } 205 206 // These are the special characters matched in functions like "p_ere_exp". 207 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 208 209 bool Regex::isLiteralERE(StringRef Str) { 210 // Check for regex metacharacters. This list was derived from our regex 211 // implementation in regcomp.c and double checked against the POSIX extended 212 // regular expression specification. 213 return Str.find_first_of(RegexMetachars) == StringRef::npos; 214 } 215 216 std::string Regex::escape(StringRef String) { 217 std::string RegexStr; 218 for (char C : String) { 219 if (strchr(RegexMetachars, C)) 220 RegexStr += '\\'; 221 RegexStr += C; 222 } 223 224 return RegexStr; 225 } 226