1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/ADT/StringRef.h" 10 #include "llvm/ADT/APFloat.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/edit_distance.h" 15 #include "llvm/Support/Error.h" 16 #include <bitset> 17 18 using namespace llvm; 19 20 // MSVC emits references to this into the translation units which reference it. 21 #ifndef _MSC_VER 22 constexpr size_t StringRef::npos; 23 #endif 24 25 // strncasecmp() is not available on non-POSIX systems, so define an 26 // alternative function here. 27 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 28 for (size_t I = 0; I < Length; ++I) { 29 unsigned char LHC = toLower(LHS[I]); 30 unsigned char RHC = toLower(RHS[I]); 31 if (LHC != RHC) 32 return LHC < RHC ? -1 : 1; 33 } 34 return 0; 35 } 36 37 int StringRef::compare_insensitive(StringRef RHS) const { 38 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 39 return Res; 40 if (Length == RHS.Length) 41 return 0; 42 return Length < RHS.Length ? -1 : 1; 43 } 44 45 bool StringRef::starts_with_insensitive(StringRef Prefix) const { 46 return Length >= Prefix.Length && 47 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 48 } 49 50 bool StringRef::ends_with_insensitive(StringRef Suffix) const { 51 return Length >= Suffix.Length && 52 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 53 } 54 55 size_t StringRef::find_insensitive(char C, size_t From) const { 56 char L = toLower(C); 57 return find_if([L](char D) { return toLower(D) == L; }, From); 58 } 59 60 /// compare_numeric - Compare strings, handle embedded numbers. 61 int StringRef::compare_numeric(StringRef RHS) const { 62 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 63 // Check for sequences of digits. 64 if (isDigit(Data[I]) && isDigit(RHS.Data[I])) { 65 // The longer sequence of numbers is considered larger. 66 // This doesn't really handle prefixed zeros well. 67 size_t J; 68 for (J = I + 1; J != E + 1; ++J) { 69 bool ld = J < Length && isDigit(Data[J]); 70 bool rd = J < RHS.Length && isDigit(RHS.Data[J]); 71 if (ld != rd) 72 return rd ? -1 : 1; 73 if (!rd) 74 break; 75 } 76 // The two number sequences have the same length (J-I), just memcmp them. 77 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 78 return Res < 0 ? -1 : 1; 79 // Identical number sequences, continue search after the numbers. 80 I = J - 1; 81 continue; 82 } 83 if (Data[I] != RHS.Data[I]) 84 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 85 } 86 if (Length == RHS.Length) 87 return 0; 88 return Length < RHS.Length ? -1 : 1; 89 } 90 91 // Compute the edit distance between the two given strings. 92 unsigned StringRef::edit_distance(llvm::StringRef Other, 93 bool AllowReplacements, 94 unsigned MaxEditDistance) const { 95 return llvm::ComputeEditDistance(ArrayRef(data(), size()), 96 ArrayRef(Other.data(), Other.size()), 97 AllowReplacements, MaxEditDistance); 98 } 99 100 unsigned llvm::StringRef::edit_distance_insensitive( 101 StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const { 102 return llvm::ComputeMappedEditDistance( 103 ArrayRef(data(), size()), ArrayRef(Other.data(), Other.size()), 104 llvm::toLower, AllowReplacements, MaxEditDistance); 105 } 106 107 //===----------------------------------------------------------------------===// 108 // String Operations 109 //===----------------------------------------------------------------------===// 110 111 std::string StringRef::lower() const { 112 return std::string(map_iterator(begin(), toLower), 113 map_iterator(end(), toLower)); 114 } 115 116 std::string StringRef::upper() const { 117 return std::string(map_iterator(begin(), toUpper), 118 map_iterator(end(), toUpper)); 119 } 120 121 //===----------------------------------------------------------------------===// 122 // String Searching 123 //===----------------------------------------------------------------------===// 124 125 126 /// find - Search for the first string \arg Str in the string. 127 /// 128 /// \return - The index of the first occurrence of \arg Str, or npos if not 129 /// found. 130 size_t StringRef::find(StringRef Str, size_t From) const { 131 if (From > Length) 132 return npos; 133 134 const char *Start = Data + From; 135 size_t Size = Length - From; 136 137 const char *Needle = Str.data(); 138 size_t N = Str.size(); 139 if (N == 0) 140 return From; 141 if (Size < N) 142 return npos; 143 if (N == 1) { 144 const char *Ptr = (const char *)::memchr(Start, Needle[0], Size); 145 return Ptr == nullptr ? npos : Ptr - Data; 146 } 147 148 const char *Stop = Start + (Size - N + 1); 149 150 if (N == 2) { 151 // Provide a fast path for newline finding (CRLF case) in InclusionRewriter. 152 // Not the most optimized strategy, but getting memcmp inlined should be 153 // good enough. 154 do { 155 if (std::memcmp(Start, Needle, 2) == 0) 156 return Start - Data; 157 ++Start; 158 } while (Start < Stop); 159 return npos; 160 } 161 162 // For short haystacks or unsupported needles fall back to the naive algorithm 163 if (Size < 16 || N > 255) { 164 do { 165 if (std::memcmp(Start, Needle, N) == 0) 166 return Start - Data; 167 ++Start; 168 } while (Start < Stop); 169 return npos; 170 } 171 172 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 173 uint8_t BadCharSkip[256]; 174 std::memset(BadCharSkip, N, 256); 175 for (unsigned i = 0; i != N-1; ++i) 176 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 177 178 do { 179 uint8_t Last = Start[N - 1]; 180 if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1])) 181 if (std::memcmp(Start, Needle, N - 1) == 0) 182 return Start - Data; 183 184 // Otherwise skip the appropriate number of bytes. 185 Start += BadCharSkip[Last]; 186 } while (Start < Stop); 187 188 return npos; 189 } 190 191 size_t StringRef::find_insensitive(StringRef Str, size_t From) const { 192 StringRef This = substr(From); 193 while (This.size() >= Str.size()) { 194 if (This.starts_with_insensitive(Str)) 195 return From; 196 This = This.drop_front(); 197 ++From; 198 } 199 return npos; 200 } 201 202 size_t StringRef::rfind_insensitive(char C, size_t From) const { 203 From = std::min(From, Length); 204 size_t i = From; 205 while (i != 0) { 206 --i; 207 if (toLower(Data[i]) == toLower(C)) 208 return i; 209 } 210 return npos; 211 } 212 213 /// rfind - Search for the last string \arg Str in the string. 214 /// 215 /// \return - The index of the last occurrence of \arg Str, or npos if not 216 /// found. 217 size_t StringRef::rfind(StringRef Str) const { 218 return std::string_view(*this).rfind(Str); 219 } 220 221 size_t StringRef::rfind_insensitive(StringRef Str) const { 222 size_t N = Str.size(); 223 if (N > Length) 224 return npos; 225 for (size_t i = Length - N + 1, e = 0; i != e;) { 226 --i; 227 if (substr(i, N).equals_insensitive(Str)) 228 return i; 229 } 230 return npos; 231 } 232 233 /// find_first_of - Find the first character in the string that is in \arg 234 /// Chars, or npos if not found. 235 /// 236 /// Note: O(size() + Chars.size()) 237 StringRef::size_type StringRef::find_first_of(StringRef Chars, 238 size_t From) const { 239 std::bitset<1 << CHAR_BIT> CharBits; 240 for (char C : Chars) 241 CharBits.set((unsigned char)C); 242 243 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 244 if (CharBits.test((unsigned char)Data[i])) 245 return i; 246 return npos; 247 } 248 249 /// find_first_not_of - Find the first character in the string that is not 250 /// \arg C or npos if not found. 251 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 252 return std::string_view(*this).find_first_not_of(C, From); 253 } 254 255 /// find_first_not_of - Find the first character in the string that is not 256 /// in the string \arg Chars, or npos if not found. 257 /// 258 /// Note: O(size() + Chars.size()) 259 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 260 size_t From) const { 261 std::bitset<1 << CHAR_BIT> CharBits; 262 for (char C : Chars) 263 CharBits.set((unsigned char)C); 264 265 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 266 if (!CharBits.test((unsigned char)Data[i])) 267 return i; 268 return npos; 269 } 270 271 /// find_last_of - Find the last character in the string that is in \arg C, 272 /// or npos if not found. 273 /// 274 /// Note: O(size() + Chars.size()) 275 StringRef::size_type StringRef::find_last_of(StringRef Chars, 276 size_t From) const { 277 std::bitset<1 << CHAR_BIT> CharBits; 278 for (char C : Chars) 279 CharBits.set((unsigned char)C); 280 281 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 282 if (CharBits.test((unsigned char)Data[i])) 283 return i; 284 return npos; 285 } 286 287 /// find_last_not_of - Find the last character in the string that is not 288 /// \arg C, or npos if not found. 289 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 290 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 291 if (Data[i] != C) 292 return i; 293 return npos; 294 } 295 296 /// find_last_not_of - Find the last character in the string that is not in 297 /// \arg Chars, or npos if not found. 298 /// 299 /// Note: O(size() + Chars.size()) 300 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 301 size_t From) const { 302 std::bitset<1 << CHAR_BIT> CharBits; 303 for (char C : Chars) 304 CharBits.set((unsigned char)C); 305 306 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 307 if (!CharBits.test((unsigned char)Data[i])) 308 return i; 309 return npos; 310 } 311 312 void StringRef::split(SmallVectorImpl<StringRef> &A, 313 StringRef Separator, int MaxSplit, 314 bool KeepEmpty) const { 315 StringRef S = *this; 316 317 // Count down from MaxSplit. When MaxSplit is -1, this will just split 318 // "forever". This doesn't support splitting more than 2^31 times 319 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 320 // but that seems unlikely to be useful. 321 while (MaxSplit-- != 0) { 322 size_t Idx = S.find(Separator); 323 if (Idx == npos) 324 break; 325 326 // Push this split. 327 if (KeepEmpty || Idx > 0) 328 A.push_back(S.slice(0, Idx)); 329 330 // Jump forward. 331 S = S.slice(Idx + Separator.size(), npos); 332 } 333 334 // Push the tail. 335 if (KeepEmpty || !S.empty()) 336 A.push_back(S); 337 } 338 339 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 340 int MaxSplit, bool KeepEmpty) const { 341 StringRef S = *this; 342 343 // Count down from MaxSplit. When MaxSplit is -1, this will just split 344 // "forever". This doesn't support splitting more than 2^31 times 345 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 346 // but that seems unlikely to be useful. 347 while (MaxSplit-- != 0) { 348 size_t Idx = S.find(Separator); 349 if (Idx == npos) 350 break; 351 352 // Push this split. 353 if (KeepEmpty || Idx > 0) 354 A.push_back(S.slice(0, Idx)); 355 356 // Jump forward. 357 S = S.slice(Idx + 1, npos); 358 } 359 360 // Push the tail. 361 if (KeepEmpty || !S.empty()) 362 A.push_back(S); 363 } 364 365 //===----------------------------------------------------------------------===// 366 // Helpful Algorithms 367 //===----------------------------------------------------------------------===// 368 369 /// count - Return the number of non-overlapped occurrences of \arg Str in 370 /// the string. 371 size_t StringRef::count(StringRef Str) const { 372 size_t Count = 0; 373 size_t Pos = 0; 374 size_t N = Str.size(); 375 // TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing 376 // this to `Length + 1` which is more in-line with the function 377 // description. 378 if (!N) 379 return 0; 380 while ((Pos = find(Str, Pos)) != npos) { 381 ++Count; 382 Pos += N; 383 } 384 return Count; 385 } 386 387 static unsigned GetAutoSenseRadix(StringRef &Str) { 388 if (Str.empty()) 389 return 10; 390 391 if (Str.consume_front_insensitive("0x")) 392 return 16; 393 394 if (Str.consume_front_insensitive("0b")) 395 return 2; 396 397 if (Str.consume_front("0o")) 398 return 8; 399 400 if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) { 401 Str = Str.substr(1); 402 return 8; 403 } 404 405 return 10; 406 } 407 408 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 409 unsigned long long &Result) { 410 // Autosense radix if not specified. 411 if (Radix == 0) 412 Radix = GetAutoSenseRadix(Str); 413 414 // Empty strings (after the radix autosense) are invalid. 415 if (Str.empty()) return true; 416 417 // Parse all the bytes of the string given this radix. Watch for overflow. 418 StringRef Str2 = Str; 419 Result = 0; 420 while (!Str2.empty()) { 421 unsigned CharVal; 422 if (Str2[0] >= '0' && Str2[0] <= '9') 423 CharVal = Str2[0] - '0'; 424 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 425 CharVal = Str2[0] - 'a' + 10; 426 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 427 CharVal = Str2[0] - 'A' + 10; 428 else 429 break; 430 431 // If the parsed value is larger than the integer radix, we cannot 432 // consume any more characters. 433 if (CharVal >= Radix) 434 break; 435 436 // Add in this character. 437 unsigned long long PrevResult = Result; 438 Result = Result * Radix + CharVal; 439 440 // Check for overflow by shifting back and seeing if bits were lost. 441 if (Result / Radix < PrevResult) 442 return true; 443 444 Str2 = Str2.substr(1); 445 } 446 447 // We consider the operation a failure if no characters were consumed 448 // successfully. 449 if (Str.size() == Str2.size()) 450 return true; 451 452 Str = Str2; 453 return false; 454 } 455 456 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 457 long long &Result) { 458 unsigned long long ULLVal; 459 460 // Handle positive strings first. 461 if (!Str.starts_with("-")) { 462 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 463 // Check for value so large it overflows a signed value. 464 (long long)ULLVal < 0) 465 return true; 466 Result = ULLVal; 467 return false; 468 } 469 470 // Get the positive part of the value. 471 StringRef Str2 = Str.drop_front(1); 472 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 473 // Reject values so large they'd overflow as negative signed, but allow 474 // "-0". This negates the unsigned so that the negative isn't undefined 475 // on signed overflow. 476 (long long)-ULLVal > 0) 477 return true; 478 479 Str = Str2; 480 Result = -ULLVal; 481 return false; 482 } 483 484 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 485 /// sequence of radix up to 36 to an unsigned long long value. 486 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 487 unsigned long long &Result) { 488 if (consumeUnsignedInteger(Str, Radix, Result)) 489 return true; 490 491 // For getAsUnsignedInteger, we require the whole string to be consumed or 492 // else we consider it a failure. 493 return !Str.empty(); 494 } 495 496 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 497 long long &Result) { 498 if (consumeSignedInteger(Str, Radix, Result)) 499 return true; 500 501 // For getAsSignedInteger, we require the whole string to be consumed or else 502 // we consider it a failure. 503 return !Str.empty(); 504 } 505 506 bool StringRef::consumeInteger(unsigned Radix, APInt &Result) { 507 StringRef Str = *this; 508 509 // Autosense radix if not specified. 510 if (Radix == 0) 511 Radix = GetAutoSenseRadix(Str); 512 513 assert(Radix > 1 && Radix <= 36); 514 515 // Empty strings (after the radix autosense) are invalid. 516 if (Str.empty()) return true; 517 518 // Skip leading zeroes. This can be a significant improvement if 519 // it means we don't need > 64 bits. 520 Str = Str.ltrim('0'); 521 522 // If it was nothing but zeroes.... 523 if (Str.empty()) { 524 Result = APInt(64, 0); 525 *this = Str; 526 return false; 527 } 528 529 // (Over-)estimate the required number of bits. 530 unsigned Log2Radix = 0; 531 while ((1U << Log2Radix) < Radix) Log2Radix++; 532 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 533 534 unsigned BitWidth = Log2Radix * Str.size(); 535 if (BitWidth < Result.getBitWidth()) 536 BitWidth = Result.getBitWidth(); // don't shrink the result 537 else if (BitWidth > Result.getBitWidth()) 538 Result = Result.zext(BitWidth); 539 540 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 541 if (!IsPowerOf2Radix) { 542 // These must have the same bit-width as Result. 543 RadixAP = APInt(BitWidth, Radix); 544 CharAP = APInt(BitWidth, 0); 545 } 546 547 // Parse all the bytes of the string given this radix. 548 Result = 0; 549 while (!Str.empty()) { 550 unsigned CharVal; 551 if (Str[0] >= '0' && Str[0] <= '9') 552 CharVal = Str[0]-'0'; 553 else if (Str[0] >= 'a' && Str[0] <= 'z') 554 CharVal = Str[0]-'a'+10; 555 else if (Str[0] >= 'A' && Str[0] <= 'Z') 556 CharVal = Str[0]-'A'+10; 557 else 558 break; 559 560 // If the parsed value is larger than the integer radix, the string is 561 // invalid. 562 if (CharVal >= Radix) 563 break; 564 565 // Add in this character. 566 if (IsPowerOf2Radix) { 567 Result <<= Log2Radix; 568 Result |= CharVal; 569 } else { 570 Result *= RadixAP; 571 CharAP = CharVal; 572 Result += CharAP; 573 } 574 575 Str = Str.substr(1); 576 } 577 578 // We consider the operation a failure if no characters were consumed 579 // successfully. 580 if (size() == Str.size()) 581 return true; 582 583 *this = Str; 584 return false; 585 } 586 587 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 588 StringRef Str = *this; 589 if (Str.consumeInteger(Radix, Result)) 590 return true; 591 592 // For getAsInteger, we require the whole string to be consumed or else we 593 // consider it a failure. 594 return !Str.empty(); 595 } 596 597 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { 598 APFloat F(0.0); 599 auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); 600 if (errorToBool(StatusOrErr.takeError())) 601 return true; 602 603 APFloat::opStatus Status = *StatusOrErr; 604 if (Status != APFloat::opOK) { 605 if (!AllowInexact || !(Status & APFloat::opInexact)) 606 return true; 607 } 608 609 Result = F.convertToDouble(); 610 return false; 611 } 612 613 // Implementation of StringRef hashing. 614 hash_code llvm::hash_value(StringRef S) { 615 return hash_combine_range(S.begin(), S.end()); 616 } 617 618 unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) { 619 assert(Val.data() != getEmptyKey().data() && 620 "Cannot hash the empty key!"); 621 assert(Val.data() != getTombstoneKey().data() && 622 "Cannot hash the tombstone key!"); 623 return (unsigned)(hash_value(Val)); 624 } 625