xref: /freebsd/contrib/llvm-project/llvm/lib/Support/Regex.cpp (revision b0d29bc47dba79f6f38e67eabadfb4b32ffd9390)
1 //===-- Regex.cpp - Regular Expression matcher implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a POSIX regular expression matcher.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/Regex.h"
14 #include "llvm/ADT/SmallVector.h"
15 #include "llvm/ADT/StringRef.h"
16 #include "llvm/ADT/Twine.h"
17 #include <string>
18 
19 // Important this comes last because it defines "_REGEX_H_". At least on
20 // Darwin, if included before any header that (transitively) includes
21 // xlocale.h, this will cause trouble, because of missing regex-related types.
22 #include "regex_impl.h"
23 
24 using namespace llvm;
25 
26 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {}
27 
28 Regex::Regex(StringRef regex, unsigned Flags) {
29   unsigned flags = 0;
30   preg = new llvm_regex();
31   preg->re_endp = regex.end();
32   if (Flags & IgnoreCase)
33     flags |= REG_ICASE;
34   if (Flags & Newline)
35     flags |= REG_NEWLINE;
36   if (!(Flags & BasicRegex))
37     flags |= REG_EXTENDED;
38   error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
39 }
40 
41 Regex::Regex(Regex &&regex) {
42   preg = regex.preg;
43   error = regex.error;
44   regex.preg = nullptr;
45   regex.error = REG_BADPAT;
46 }
47 
48 Regex::~Regex() {
49   if (preg) {
50     llvm_regfree(preg);
51     delete preg;
52   }
53 }
54 
55 namespace {
56 
57 /// Utility to convert a regex error code into a human-readable string.
58 void RegexErrorToString(int error, struct llvm_regex *preg,
59                         std::string &Error) {
60   size_t len = llvm_regerror(error, preg, nullptr, 0);
61 
62   Error.resize(len - 1);
63   llvm_regerror(error, preg, &Error[0], len);
64 }
65 
66 } // namespace
67 
68 bool Regex::isValid(std::string &Error) const {
69   if (!error)
70     return true;
71 
72   RegexErrorToString(error, preg, Error);
73   return false;
74 }
75 
76 /// getNumMatches - In a valid regex, return the number of parenthesized
77 /// matches it contains.
78 unsigned Regex::getNumMatches() const {
79   return preg->re_nsub;
80 }
81 
82 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches,
83                   std::string *Error) const {
84   // Reset error, if given.
85   if (Error && !Error->empty())
86     *Error = "";
87 
88   // Check if the regex itself didn't successfully compile.
89   if (Error ? !isValid(*Error) : !isValid())
90     return false;
91 
92   unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
93 
94   // pmatch needs to have at least one element.
95   SmallVector<llvm_regmatch_t, 8> pm;
96   pm.resize(nmatch > 0 ? nmatch : 1);
97   pm[0].rm_so = 0;
98   pm[0].rm_eo = String.size();
99 
100   int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
101 
102   // Failure to match is not an error, it's just a normal return value.
103   // Any other error code is considered abnormal, and is logged in the Error.
104   if (rc == REG_NOMATCH)
105     return false;
106   if (rc != 0) {
107     if (Error)
108       RegexErrorToString(error, preg, *Error);
109     return false;
110   }
111 
112   // There was a match.
113 
114   if (Matches) { // match position requested
115     Matches->clear();
116 
117     for (unsigned i = 0; i != nmatch; ++i) {
118       if (pm[i].rm_so == -1) {
119         // this group didn't match
120         Matches->push_back(StringRef());
121         continue;
122       }
123       assert(pm[i].rm_eo >= pm[i].rm_so);
124       Matches->push_back(StringRef(String.data()+pm[i].rm_so,
125                                    pm[i].rm_eo-pm[i].rm_so));
126     }
127   }
128 
129   return true;
130 }
131 
132 std::string Regex::sub(StringRef Repl, StringRef String,
133                        std::string *Error) const {
134   SmallVector<StringRef, 8> Matches;
135 
136   // Return the input if there was no match.
137   if (!match(String, &Matches, Error))
138     return String;
139 
140   // Otherwise splice in the replacement string, starting with the prefix before
141   // the match.
142   std::string Res(String.begin(), Matches[0].begin());
143 
144   // Then the replacement string, honoring possible substitutions.
145   while (!Repl.empty()) {
146     // Skip to the next escape.
147     std::pair<StringRef, StringRef> Split = Repl.split('\\');
148 
149     // Add the skipped substring.
150     Res += Split.first;
151 
152     // Check for terminimation and trailing backslash.
153     if (Split.second.empty()) {
154       if (Repl.size() != Split.first.size() &&
155           Error && Error->empty())
156         *Error = "replacement string contained trailing backslash";
157       break;
158     }
159 
160     // Otherwise update the replacement string and interpret escapes.
161     Repl = Split.second;
162 
163     // FIXME: We should have a StringExtras function for mapping C99 escapes.
164     switch (Repl[0]) {
165       // Treat all unrecognized characters as self-quoting.
166     default:
167       Res += Repl[0];
168       Repl = Repl.substr(1);
169       break;
170 
171       // Single character escapes.
172     case 't':
173       Res += '\t';
174       Repl = Repl.substr(1);
175       break;
176     case 'n':
177       Res += '\n';
178       Repl = Repl.substr(1);
179       break;
180 
181       // Decimal escapes are backreferences.
182     case '0': case '1': case '2': case '3': case '4':
183     case '5': case '6': case '7': case '8': case '9': {
184       // Extract the backreference number.
185       StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
186       Repl = Repl.substr(Ref.size());
187 
188       unsigned RefValue;
189       if (!Ref.getAsInteger(10, RefValue) &&
190           RefValue < Matches.size())
191         Res += Matches[RefValue];
192       else if (Error && Error->empty())
193         *Error = ("invalid backreference string '" + Twine(Ref) + "'").str();
194       break;
195     }
196     }
197   }
198 
199   // And finally the suffix.
200   Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
201 
202   return Res;
203 }
204 
205 // These are the special characters matched in functions like "p_ere_exp".
206 static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
207 
208 bool Regex::isLiteralERE(StringRef Str) {
209   // Check for regex metacharacters.  This list was derived from our regex
210   // implementation in regcomp.c and double checked against the POSIX extended
211   // regular expression specification.
212   return Str.find_first_of(RegexMetachars) == StringRef::npos;
213 }
214 
215 std::string Regex::escape(StringRef String) {
216   std::string RegexStr;
217   for (unsigned i = 0, e = String.size(); i != e; ++i) {
218     if (strchr(RegexMetachars, String[i]))
219       RegexStr += '\\';
220     RegexStr += String[i];
221   }
222 
223   return RegexStr;
224 }
225