1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a POSIX regular expression matcher. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/Regex.h" 14 #include "llvm/ADT/SmallVector.h" 15 #include "llvm/ADT/StringRef.h" 16 #include "llvm/ADT/Twine.h" 17 #include <string> 18 19 // Important this comes last because it defines "_REGEX_H_". At least on 20 // Darwin, if included before any header that (transitively) includes 21 // xlocale.h, this will cause trouble, because of missing regex-related types. 22 #include "regex_impl.h" 23 24 using namespace llvm; 25 26 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} 27 28 Regex::Regex(StringRef regex, unsigned Flags) { 29 unsigned flags = 0; 30 preg = new llvm_regex(); 31 preg->re_endp = regex.end(); 32 if (Flags & IgnoreCase) 33 flags |= REG_ICASE; 34 if (Flags & Newline) 35 flags |= REG_NEWLINE; 36 if (!(Flags & BasicRegex)) 37 flags |= REG_EXTENDED; 38 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 39 } 40 41 Regex::Regex(Regex &®ex) { 42 preg = regex.preg; 43 error = regex.error; 44 regex.preg = nullptr; 45 regex.error = REG_BADPAT; 46 } 47 48 Regex::~Regex() { 49 if (preg) { 50 llvm_regfree(preg); 51 delete preg; 52 } 53 } 54 55 bool Regex::isValid(std::string &Error) const { 56 if (!error) 57 return true; 58 59 size_t len = llvm_regerror(error, preg, nullptr, 0); 60 61 Error.resize(len - 1); 62 llvm_regerror(error, preg, &Error[0], len); 63 return false; 64 } 65 66 /// getNumMatches - In a valid regex, return the number of parenthesized 67 /// matches it contains. 68 unsigned Regex::getNumMatches() const { 69 return preg->re_nsub; 70 } 71 72 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ 73 if (error) 74 return false; 75 76 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 77 78 // pmatch needs to have at least one element. 79 SmallVector<llvm_regmatch_t, 8> pm; 80 pm.resize(nmatch > 0 ? nmatch : 1); 81 pm[0].rm_so = 0; 82 pm[0].rm_eo = String.size(); 83 84 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 85 86 if (rc == REG_NOMATCH) 87 return false; 88 if (rc != 0) { 89 // regexec can fail due to invalid pattern or running out of memory. 90 error = rc; 91 return false; 92 } 93 94 // There was a match. 95 96 if (Matches) { // match position requested 97 Matches->clear(); 98 99 for (unsigned i = 0; i != nmatch; ++i) { 100 if (pm[i].rm_so == -1) { 101 // this group didn't match 102 Matches->push_back(StringRef()); 103 continue; 104 } 105 assert(pm[i].rm_eo >= pm[i].rm_so); 106 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 107 pm[i].rm_eo-pm[i].rm_so)); 108 } 109 } 110 111 return true; 112 } 113 114 std::string Regex::sub(StringRef Repl, StringRef String, 115 std::string *Error) { 116 SmallVector<StringRef, 8> Matches; 117 118 // Reset error, if given. 119 if (Error && !Error->empty()) *Error = ""; 120 121 // Return the input if there was no match. 122 if (!match(String, &Matches)) 123 return String; 124 125 // Otherwise splice in the replacement string, starting with the prefix before 126 // the match. 127 std::string Res(String.begin(), Matches[0].begin()); 128 129 // Then the replacement string, honoring possible substitutions. 130 while (!Repl.empty()) { 131 // Skip to the next escape. 132 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 133 134 // Add the skipped substring. 135 Res += Split.first; 136 137 // Check for terminimation and trailing backslash. 138 if (Split.second.empty()) { 139 if (Repl.size() != Split.first.size() && 140 Error && Error->empty()) 141 *Error = "replacement string contained trailing backslash"; 142 break; 143 } 144 145 // Otherwise update the replacement string and interpret escapes. 146 Repl = Split.second; 147 148 // FIXME: We should have a StringExtras function for mapping C99 escapes. 149 switch (Repl[0]) { 150 // Treat all unrecognized characters as self-quoting. 151 default: 152 Res += Repl[0]; 153 Repl = Repl.substr(1); 154 break; 155 156 // Single character escapes. 157 case 't': 158 Res += '\t'; 159 Repl = Repl.substr(1); 160 break; 161 case 'n': 162 Res += '\n'; 163 Repl = Repl.substr(1); 164 break; 165 166 // Decimal escapes are backreferences. 167 case '0': case '1': case '2': case '3': case '4': 168 case '5': case '6': case '7': case '8': case '9': { 169 // Extract the backreference number. 170 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 171 Repl = Repl.substr(Ref.size()); 172 173 unsigned RefValue; 174 if (!Ref.getAsInteger(10, RefValue) && 175 RefValue < Matches.size()) 176 Res += Matches[RefValue]; 177 else if (Error && Error->empty()) 178 *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); 179 break; 180 } 181 } 182 } 183 184 // And finally the suffix. 185 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 186 187 return Res; 188 } 189 190 // These are the special characters matched in functions like "p_ere_exp". 191 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 192 193 bool Regex::isLiteralERE(StringRef Str) { 194 // Check for regex metacharacters. This list was derived from our regex 195 // implementation in regcomp.c and double checked against the POSIX extended 196 // regular expression specification. 197 return Str.find_first_of(RegexMetachars) == StringRef::npos; 198 } 199 200 std::string Regex::escape(StringRef String) { 201 std::string RegexStr; 202 for (unsigned i = 0, e = String.size(); i != e; ++i) { 203 if (strchr(RegexMetachars, String[i])) 204 RegexStr += '\\'; 205 RegexStr += String[i]; 206 } 207 208 return RegexStr; 209 } 210