1 //===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a glob pattern matcher. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/GlobPattern.h" 14 #include "llvm/ADT/StringRef.h" 15 #include "llvm/Support/Errc.h" 16 17 using namespace llvm; 18 19 // Expands character ranges and returns a bitmap. 20 // For example, "a-cf-hz" is expanded to "abcfghz". 21 static Expected<BitVector> expand(StringRef S, StringRef Original) { 22 BitVector BV(256, false); 23 24 // Expand X-Y. 25 for (;;) { 26 if (S.size() < 3) 27 break; 28 29 uint8_t Start = S[0]; 30 uint8_t End = S[2]; 31 32 // If it doesn't start with something like X-Y, 33 // consume the first character and proceed. 34 if (S[1] != '-') { 35 BV[Start] = true; 36 S = S.substr(1); 37 continue; 38 } 39 40 // It must be in the form of X-Y. 41 // Validate it and then interpret the range. 42 if (Start > End) 43 return make_error<StringError>("invalid glob pattern: " + Original, 44 errc::invalid_argument); 45 46 for (int C = Start; C <= End; ++C) 47 BV[(uint8_t)C] = true; 48 S = S.substr(3); 49 } 50 51 for (char C : S) 52 BV[(uint8_t)C] = true; 53 return BV; 54 } 55 56 // Identify brace expansions in S and return the list of patterns they expand 57 // into. 58 static Expected<SmallVector<std::string, 1>> 59 parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) { 60 SmallVector<std::string> SubPatterns = {S.str()}; 61 if (!MaxSubPatterns || !S.contains('{')) 62 return std::move(SubPatterns); 63 64 struct BraceExpansion { 65 size_t Start; 66 size_t Length; 67 SmallVector<StringRef, 2> Terms; 68 }; 69 SmallVector<BraceExpansion, 0> BraceExpansions; 70 71 BraceExpansion *CurrentBE = nullptr; 72 size_t TermBegin; 73 for (size_t I = 0, E = S.size(); I != E; ++I) { 74 if (S[I] == '[') { 75 I = S.find(']', I + 2); 76 if (I == std::string::npos) 77 return make_error<StringError>("invalid glob pattern, unmatched '['", 78 errc::invalid_argument); 79 } else if (S[I] == '{') { 80 if (CurrentBE) 81 return make_error<StringError>( 82 "nested brace expansions are not supported", 83 errc::invalid_argument); 84 CurrentBE = &BraceExpansions.emplace_back(); 85 CurrentBE->Start = I; 86 TermBegin = I + 1; 87 } else if (S[I] == ',') { 88 if (!CurrentBE) 89 continue; 90 CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin)); 91 TermBegin = I + 1; 92 } else if (S[I] == '}') { 93 if (!CurrentBE) 94 continue; 95 if (CurrentBE->Terms.empty()) 96 return make_error<StringError>( 97 "empty or singleton brace expansions are not supported", 98 errc::invalid_argument); 99 CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin)); 100 CurrentBE->Length = I - CurrentBE->Start + 1; 101 CurrentBE = nullptr; 102 } else if (S[I] == '\\') { 103 if (++I == E) 104 return make_error<StringError>("invalid glob pattern, stray '\\'", 105 errc::invalid_argument); 106 } 107 } 108 if (CurrentBE) 109 return make_error<StringError>("incomplete brace expansion", 110 errc::invalid_argument); 111 112 size_t NumSubPatterns = 1; 113 for (auto &BE : BraceExpansions) { 114 if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) { 115 NumSubPatterns = std::numeric_limits<size_t>::max(); 116 break; 117 } 118 NumSubPatterns *= BE.Terms.size(); 119 } 120 if (NumSubPatterns > *MaxSubPatterns) 121 return make_error<StringError>("too many brace expansions", 122 errc::invalid_argument); 123 // Replace brace expansions in reverse order so that we don't invalidate 124 // earlier start indices 125 for (auto &BE : reverse(BraceExpansions)) { 126 SmallVector<std::string> OrigSubPatterns; 127 std::swap(SubPatterns, OrigSubPatterns); 128 for (StringRef Term : BE.Terms) 129 for (StringRef Orig : OrigSubPatterns) 130 SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term); 131 } 132 return std::move(SubPatterns); 133 } 134 135 Expected<GlobPattern> 136 GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) { 137 GlobPattern Pat; 138 139 // Store the prefix that does not contain any metacharacter. 140 size_t PrefixSize = S.find_first_of("?*[{\\"); 141 Pat.Prefix = S.substr(0, PrefixSize); 142 if (PrefixSize == std::string::npos) 143 return Pat; 144 S = S.substr(PrefixSize); 145 146 SmallVector<std::string, 1> SubPats; 147 if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats)) 148 return std::move(Err); 149 for (StringRef SubPat : SubPats) { 150 auto SubGlobOrErr = SubGlobPattern::create(SubPat); 151 if (!SubGlobOrErr) 152 return SubGlobOrErr.takeError(); 153 Pat.SubGlobs.push_back(*SubGlobOrErr); 154 } 155 156 return Pat; 157 } 158 159 Expected<GlobPattern::SubGlobPattern> 160 GlobPattern::SubGlobPattern::create(StringRef S) { 161 SubGlobPattern Pat; 162 163 // Parse brackets. 164 Pat.Pat.assign(S.begin(), S.end()); 165 for (size_t I = 0, E = S.size(); I != E; ++I) { 166 if (S[I] == '[') { 167 // ']' is allowed as the first character of a character class. '[]' is 168 // invalid. So, just skip the first character. 169 ++I; 170 size_t J = S.find(']', I + 1); 171 if (J == StringRef::npos) 172 return make_error<StringError>("invalid glob pattern, unmatched '['", 173 errc::invalid_argument); 174 StringRef Chars = S.substr(I, J - I); 175 bool Invert = S[I] == '^' || S[I] == '!'; 176 Expected<BitVector> BV = 177 Invert ? expand(Chars.substr(1), S) : expand(Chars, S); 178 if (!BV) 179 return BV.takeError(); 180 if (Invert) 181 BV->flip(); 182 Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)}); 183 I = J; 184 } else if (S[I] == '\\') { 185 if (++I == E) 186 return make_error<StringError>("invalid glob pattern, stray '\\'", 187 errc::invalid_argument); 188 } 189 } 190 return Pat; 191 } 192 193 bool GlobPattern::match(StringRef S) const { 194 if (!S.consume_front(Prefix)) 195 return false; 196 if (SubGlobs.empty() && S.empty()) 197 return true; 198 for (auto &Glob : SubGlobs) 199 if (Glob.match(S)) 200 return true; 201 return false; 202 } 203 204 // Factor the pattern into segments split by '*'. The segment is matched 205 // sequentianlly by finding the first occurrence past the end of the previous 206 // match. 207 bool GlobPattern::SubGlobPattern::match(StringRef Str) const { 208 const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(), 209 *SavedS = S; 210 const char *const PEnd = P + Pat.size(), *const End = S + Str.size(); 211 size_t B = 0, SavedB = 0; 212 while (S != End) { 213 if (P == PEnd) 214 ; 215 else if (*P == '*') { 216 // The non-* substring on the left of '*' matches the tail of S. Save the 217 // positions to be used by backtracking if we see a mismatch later. 218 SegmentBegin = ++P; 219 SavedS = S; 220 SavedB = B; 221 continue; 222 } else if (*P == '[') { 223 if (Brackets[B].Bytes[uint8_t(*S)]) { 224 P = Pat.data() + Brackets[B++].NextOffset; 225 ++S; 226 continue; 227 } 228 } else if (*P == '\\') { 229 if (*++P == *S) { 230 ++P; 231 ++S; 232 continue; 233 } 234 } else if (*P == *S || *P == '?') { 235 ++P; 236 ++S; 237 continue; 238 } 239 if (!SegmentBegin) 240 return false; 241 // We have seen a '*'. Backtrack to the saved positions. Shift the S 242 // position to probe the next starting position in the segment. 243 P = SegmentBegin; 244 S = ++SavedS; 245 B = SavedB; 246 } 247 // All bytes in Str have been matched. Return true if the rest part of Pat is 248 // empty or contains only '*'. 249 return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos; 250 } 251