1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/ADT/StringRef.h" 10 #include "llvm/ADT/APFloat.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/edit_distance.h" 15 #include "llvm/Support/Error.h" 16 #include <bitset> 17 18 using namespace llvm; 19 20 // MSVC emits references to this into the translation units which reference it. 21 #ifndef _MSC_VER 22 constexpr size_t StringRef::npos; 23 #endif 24 25 // strncasecmp() is not available on non-POSIX systems, so define an 26 // alternative function here. 27 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 28 for (size_t I = 0; I < Length; ++I) { 29 unsigned char LHC = toLower(LHS[I]); 30 unsigned char RHC = toLower(RHS[I]); 31 if (LHC != RHC) 32 return LHC < RHC ? -1 : 1; 33 } 34 return 0; 35 } 36 37 int StringRef::compare_insensitive(StringRef RHS) const { 38 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 39 return Res; 40 if (Length == RHS.Length) 41 return 0; 42 return Length < RHS.Length ? -1 : 1; 43 } 44 45 bool StringRef::starts_with_insensitive(StringRef Prefix) const { 46 return Length >= Prefix.Length && 47 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 48 } 49 50 bool StringRef::ends_with_insensitive(StringRef Suffix) const { 51 return Length >= Suffix.Length && 52 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 53 } 54 55 size_t StringRef::find_insensitive(char C, size_t From) const { 56 char L = toLower(C); 57 return find_if([L](char D) { return toLower(D) == L; }, From); 58 } 59 60 /// compare_numeric - Compare strings, handle embedded numbers. 61 int StringRef::compare_numeric(StringRef RHS) const { 62 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 63 // Check for sequences of digits. 64 if (isDigit(Data[I]) && isDigit(RHS.Data[I])) { 65 // The longer sequence of numbers is considered larger. 66 // This doesn't really handle prefixed zeros well. 67 size_t J; 68 for (J = I + 1; J != E + 1; ++J) { 69 bool ld = J < Length && isDigit(Data[J]); 70 bool rd = J < RHS.Length && isDigit(RHS.Data[J]); 71 if (ld != rd) 72 return rd ? -1 : 1; 73 if (!rd) 74 break; 75 } 76 // The two number sequences have the same length (J-I), just memcmp them. 77 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 78 return Res < 0 ? -1 : 1; 79 // Identical number sequences, continue search after the numbers. 80 I = J - 1; 81 continue; 82 } 83 if (Data[I] != RHS.Data[I]) 84 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 85 } 86 if (Length == RHS.Length) 87 return 0; 88 return Length < RHS.Length ? -1 : 1; 89 } 90 91 // Compute the edit distance between the two given strings. 92 unsigned StringRef::edit_distance(llvm::StringRef Other, 93 bool AllowReplacements, 94 unsigned MaxEditDistance) const { 95 return llvm::ComputeEditDistance(ArrayRef(data(), size()), 96 ArrayRef(Other.data(), Other.size()), 97 AllowReplacements, MaxEditDistance); 98 } 99 100 unsigned llvm::StringRef::edit_distance_insensitive( 101 StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const { 102 return llvm::ComputeMappedEditDistance( 103 ArrayRef(data(), size()), ArrayRef(Other.data(), Other.size()), 104 llvm::toLower, AllowReplacements, MaxEditDistance); 105 } 106 107 //===----------------------------------------------------------------------===// 108 // String Operations 109 //===----------------------------------------------------------------------===// 110 111 std::string StringRef::lower() const { 112 return std::string(map_iterator(begin(), toLower), 113 map_iterator(end(), toLower)); 114 } 115 116 std::string StringRef::upper() const { 117 return std::string(map_iterator(begin(), toUpper), 118 map_iterator(end(), toUpper)); 119 } 120 121 //===----------------------------------------------------------------------===// 122 // String Searching 123 //===----------------------------------------------------------------------===// 124 125 126 /// find - Search for the first string \arg Str in the string. 127 /// 128 /// \return - The index of the first occurrence of \arg Str, or npos if not 129 /// found. 130 size_t StringRef::find(StringRef Str, size_t From) const { 131 if (From > Length) 132 return npos; 133 134 const char *Start = Data + From; 135 size_t Size = Length - From; 136 137 const char *Needle = Str.data(); 138 size_t N = Str.size(); 139 if (N == 0) 140 return From; 141 if (Size < N) 142 return npos; 143 if (N == 1) { 144 const char *Ptr = (const char *)::memchr(Start, Needle[0], Size); 145 return Ptr == nullptr ? npos : Ptr - Data; 146 } 147 148 const char *Stop = Start + (Size - N + 1); 149 150 if (N == 2) { 151 // Provide a fast path for newline finding (CRLF case) in InclusionRewriter. 152 // Not the most optimized strategy, but getting memcmp inlined should be 153 // good enough. 154 do { 155 if (std::memcmp(Start, Needle, 2) == 0) 156 return Start - Data; 157 ++Start; 158 } while (Start < Stop); 159 return npos; 160 } 161 162 // For short haystacks or unsupported needles fall back to the naive algorithm 163 if (Size < 16 || N > 255) { 164 do { 165 if (std::memcmp(Start, Needle, N) == 0) 166 return Start - Data; 167 ++Start; 168 } while (Start < Stop); 169 return npos; 170 } 171 172 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 173 uint8_t BadCharSkip[256]; 174 std::memset(BadCharSkip, N, 256); 175 for (unsigned i = 0; i != N-1; ++i) 176 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 177 178 do { 179 uint8_t Last = Start[N - 1]; 180 if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1])) 181 if (std::memcmp(Start, Needle, N - 1) == 0) 182 return Start - Data; 183 184 // Otherwise skip the appropriate number of bytes. 185 Start += BadCharSkip[Last]; 186 } while (Start < Stop); 187 188 return npos; 189 } 190 191 size_t StringRef::find_insensitive(StringRef Str, size_t From) const { 192 StringRef This = substr(From); 193 while (This.size() >= Str.size()) { 194 if (This.starts_with_insensitive(Str)) 195 return From; 196 This = This.drop_front(); 197 ++From; 198 } 199 return npos; 200 } 201 202 size_t StringRef::rfind_insensitive(char C, size_t From) const { 203 From = std::min(From, Length); 204 size_t i = From; 205 while (i != 0) { 206 --i; 207 if (toLower(Data[i]) == toLower(C)) 208 return i; 209 } 210 return npos; 211 } 212 213 /// rfind - Search for the last string \arg Str in the string. 214 /// 215 /// \return - The index of the last occurrence of \arg Str, or npos if not 216 /// found. 217 size_t StringRef::rfind(StringRef Str) const { 218 return std::string_view(*this).rfind(Str); 219 } 220 221 size_t StringRef::rfind_insensitive(StringRef Str) const { 222 size_t N = Str.size(); 223 if (N > Length) 224 return npos; 225 for (size_t i = Length - N + 1, e = 0; i != e;) { 226 --i; 227 if (substr(i, N).equals_insensitive(Str)) 228 return i; 229 } 230 return npos; 231 } 232 233 /// find_first_of - Find the first character in the string that is in \arg 234 /// Chars, or npos if not found. 235 /// 236 /// Note: O(size() + Chars.size()) 237 StringRef::size_type StringRef::find_first_of(StringRef Chars, 238 size_t From) const { 239 std::bitset<1 << CHAR_BIT> CharBits; 240 for (char C : Chars) 241 CharBits.set((unsigned char)C); 242 243 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 244 if (CharBits.test((unsigned char)Data[i])) 245 return i; 246 return npos; 247 } 248 249 /// find_first_not_of - Find the first character in the string that is not 250 /// \arg C or npos if not found. 251 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 252 return std::string_view(*this).find_first_not_of(C, From); 253 } 254 255 /// find_first_not_of - Find the first character in the string that is not 256 /// in the string \arg Chars, or npos if not found. 257 /// 258 /// Note: O(size() + Chars.size()) 259 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 260 size_t From) const { 261 std::bitset<1 << CHAR_BIT> CharBits; 262 for (char C : Chars) 263 CharBits.set((unsigned char)C); 264 265 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 266 if (!CharBits.test((unsigned char)Data[i])) 267 return i; 268 return npos; 269 } 270 271 /// find_last_of - Find the last character in the string that is in \arg C, 272 /// or npos if not found. 273 /// 274 /// Note: O(size() + Chars.size()) 275 StringRef::size_type StringRef::find_last_of(StringRef Chars, 276 size_t From) const { 277 std::bitset<1 << CHAR_BIT> CharBits; 278 for (char C : Chars) 279 CharBits.set((unsigned char)C); 280 281 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 282 if (CharBits.test((unsigned char)Data[i])) 283 return i; 284 return npos; 285 } 286 287 /// find_last_not_of - Find the last character in the string that is not 288 /// \arg C, or npos if not found. 289 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 290 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 291 if (Data[i] != C) 292 return i; 293 return npos; 294 } 295 296 /// find_last_not_of - Find the last character in the string that is not in 297 /// \arg Chars, or npos if not found. 298 /// 299 /// Note: O(size() + Chars.size()) 300 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 301 size_t From) const { 302 std::bitset<1 << CHAR_BIT> CharBits; 303 for (char C : Chars) 304 CharBits.set((unsigned char)C); 305 306 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 307 if (!CharBits.test((unsigned char)Data[i])) 308 return i; 309 return npos; 310 } 311 312 void StringRef::split(SmallVectorImpl<StringRef> &A, 313 StringRef Separator, int MaxSplit, 314 bool KeepEmpty) const { 315 StringRef S = *this; 316 317 // Count down from MaxSplit. When MaxSplit is -1, this will just split 318 // "forever". This doesn't support splitting more than 2^31 times 319 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 320 // but that seems unlikely to be useful. 321 while (MaxSplit-- != 0) { 322 size_t Idx = S.find(Separator); 323 if (Idx == npos) 324 break; 325 326 // Push this split. 327 if (KeepEmpty || Idx > 0) 328 A.push_back(S.slice(0, Idx)); 329 330 // Jump forward. 331 S = S.slice(Idx + Separator.size(), npos); 332 } 333 334 // Push the tail. 335 if (KeepEmpty || !S.empty()) 336 A.push_back(S); 337 } 338 339 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 340 int MaxSplit, bool KeepEmpty) const { 341 StringRef S = *this; 342 343 // Count down from MaxSplit. When MaxSplit is -1, this will just split 344 // "forever". This doesn't support splitting more than 2^31 times 345 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 346 // but that seems unlikely to be useful. 347 while (MaxSplit-- != 0) { 348 size_t Idx = S.find(Separator); 349 if (Idx == npos) 350 break; 351 352 // Push this split. 353 if (KeepEmpty || Idx > 0) 354 A.push_back(S.slice(0, Idx)); 355 356 // Jump forward. 357 S = S.slice(Idx + 1, npos); 358 } 359 360 // Push the tail. 361 if (KeepEmpty || !S.empty()) 362 A.push_back(S); 363 } 364 365 //===----------------------------------------------------------------------===// 366 // Helpful Algorithms 367 //===----------------------------------------------------------------------===// 368 369 /// count - Return the number of non-overlapped occurrences of \arg Str in 370 /// the string. 371 size_t StringRef::count(StringRef Str) const { 372 size_t Count = 0; 373 size_t Pos = 0; 374 size_t N = Str.size(); 375 // TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing 376 // this to `Length + 1` which is more in-line with the function 377 // description. 378 if (!N) 379 return 0; 380 while ((Pos = find(Str, Pos)) != npos) { 381 ++Count; 382 Pos += N; 383 } 384 return Count; 385 } 386 387 static unsigned GetAutoSenseRadix(StringRef &Str) { 388 if (Str.empty()) 389 return 10; 390 391 if (Str.startswith("0x") || Str.startswith("0X")) { 392 Str = Str.substr(2); 393 return 16; 394 } 395 396 if (Str.startswith("0b") || Str.startswith("0B")) { 397 Str = Str.substr(2); 398 return 2; 399 } 400 401 if (Str.startswith("0o")) { 402 Str = Str.substr(2); 403 return 8; 404 } 405 406 if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) { 407 Str = Str.substr(1); 408 return 8; 409 } 410 411 return 10; 412 } 413 414 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 415 unsigned long long &Result) { 416 // Autosense radix if not specified. 417 if (Radix == 0) 418 Radix = GetAutoSenseRadix(Str); 419 420 // Empty strings (after the radix autosense) are invalid. 421 if (Str.empty()) return true; 422 423 // Parse all the bytes of the string given this radix. Watch for overflow. 424 StringRef Str2 = Str; 425 Result = 0; 426 while (!Str2.empty()) { 427 unsigned CharVal; 428 if (Str2[0] >= '0' && Str2[0] <= '9') 429 CharVal = Str2[0] - '0'; 430 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 431 CharVal = Str2[0] - 'a' + 10; 432 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 433 CharVal = Str2[0] - 'A' + 10; 434 else 435 break; 436 437 // If the parsed value is larger than the integer radix, we cannot 438 // consume any more characters. 439 if (CharVal >= Radix) 440 break; 441 442 // Add in this character. 443 unsigned long long PrevResult = Result; 444 Result = Result * Radix + CharVal; 445 446 // Check for overflow by shifting back and seeing if bits were lost. 447 if (Result / Radix < PrevResult) 448 return true; 449 450 Str2 = Str2.substr(1); 451 } 452 453 // We consider the operation a failure if no characters were consumed 454 // successfully. 455 if (Str.size() == Str2.size()) 456 return true; 457 458 Str = Str2; 459 return false; 460 } 461 462 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 463 long long &Result) { 464 unsigned long long ULLVal; 465 466 // Handle positive strings first. 467 if (Str.empty() || Str.front() != '-') { 468 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 469 // Check for value so large it overflows a signed value. 470 (long long)ULLVal < 0) 471 return true; 472 Result = ULLVal; 473 return false; 474 } 475 476 // Get the positive part of the value. 477 StringRef Str2 = Str.drop_front(1); 478 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 479 // Reject values so large they'd overflow as negative signed, but allow 480 // "-0". This negates the unsigned so that the negative isn't undefined 481 // on signed overflow. 482 (long long)-ULLVal > 0) 483 return true; 484 485 Str = Str2; 486 Result = -ULLVal; 487 return false; 488 } 489 490 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 491 /// sequence of radix up to 36 to an unsigned long long value. 492 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 493 unsigned long long &Result) { 494 if (consumeUnsignedInteger(Str, Radix, Result)) 495 return true; 496 497 // For getAsUnsignedInteger, we require the whole string to be consumed or 498 // else we consider it a failure. 499 return !Str.empty(); 500 } 501 502 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 503 long long &Result) { 504 if (consumeSignedInteger(Str, Radix, Result)) 505 return true; 506 507 // For getAsSignedInteger, we require the whole string to be consumed or else 508 // we consider it a failure. 509 return !Str.empty(); 510 } 511 512 bool StringRef::consumeInteger(unsigned Radix, APInt &Result) { 513 StringRef Str = *this; 514 515 // Autosense radix if not specified. 516 if (Radix == 0) 517 Radix = GetAutoSenseRadix(Str); 518 519 assert(Radix > 1 && Radix <= 36); 520 521 // Empty strings (after the radix autosense) are invalid. 522 if (Str.empty()) return true; 523 524 // Skip leading zeroes. This can be a significant improvement if 525 // it means we don't need > 64 bits. 526 while (!Str.empty() && Str.front() == '0') 527 Str = Str.substr(1); 528 529 // If it was nothing but zeroes.... 530 if (Str.empty()) { 531 Result = APInt(64, 0); 532 *this = Str; 533 return false; 534 } 535 536 // (Over-)estimate the required number of bits. 537 unsigned Log2Radix = 0; 538 while ((1U << Log2Radix) < Radix) Log2Radix++; 539 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 540 541 unsigned BitWidth = Log2Radix * Str.size(); 542 if (BitWidth < Result.getBitWidth()) 543 BitWidth = Result.getBitWidth(); // don't shrink the result 544 else if (BitWidth > Result.getBitWidth()) 545 Result = Result.zext(BitWidth); 546 547 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 548 if (!IsPowerOf2Radix) { 549 // These must have the same bit-width as Result. 550 RadixAP = APInt(BitWidth, Radix); 551 CharAP = APInt(BitWidth, 0); 552 } 553 554 // Parse all the bytes of the string given this radix. 555 Result = 0; 556 while (!Str.empty()) { 557 unsigned CharVal; 558 if (Str[0] >= '0' && Str[0] <= '9') 559 CharVal = Str[0]-'0'; 560 else if (Str[0] >= 'a' && Str[0] <= 'z') 561 CharVal = Str[0]-'a'+10; 562 else if (Str[0] >= 'A' && Str[0] <= 'Z') 563 CharVal = Str[0]-'A'+10; 564 else 565 break; 566 567 // If the parsed value is larger than the integer radix, the string is 568 // invalid. 569 if (CharVal >= Radix) 570 break; 571 572 // Add in this character. 573 if (IsPowerOf2Radix) { 574 Result <<= Log2Radix; 575 Result |= CharVal; 576 } else { 577 Result *= RadixAP; 578 CharAP = CharVal; 579 Result += CharAP; 580 } 581 582 Str = Str.substr(1); 583 } 584 585 // We consider the operation a failure if no characters were consumed 586 // successfully. 587 if (size() == Str.size()) 588 return true; 589 590 *this = Str; 591 return false; 592 } 593 594 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 595 StringRef Str = *this; 596 if (Str.consumeInteger(Radix, Result)) 597 return true; 598 599 // For getAsInteger, we require the whole string to be consumed or else we 600 // consider it a failure. 601 return !Str.empty(); 602 } 603 604 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { 605 APFloat F(0.0); 606 auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); 607 if (errorToBool(StatusOrErr.takeError())) 608 return true; 609 610 APFloat::opStatus Status = *StatusOrErr; 611 if (Status != APFloat::opOK) { 612 if (!AllowInexact || !(Status & APFloat::opInexact)) 613 return true; 614 } 615 616 Result = F.convertToDouble(); 617 return false; 618 } 619 620 // Implementation of StringRef hashing. 621 hash_code llvm::hash_value(StringRef S) { 622 return hash_combine_range(S.begin(), S.end()); 623 } 624 625 unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) { 626 assert(Val.data() != getEmptyKey().data() && 627 "Cannot hash the empty key!"); 628 assert(Val.data() != getTombstoneKey().data() && 629 "Cannot hash the tombstone key!"); 630 return (unsigned)(hash_value(Val)); 631 } 632