1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/ADT/StringRef.h" 10 #include "llvm/ADT/APFloat.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/edit_distance.h" 15 #include "llvm/Support/Error.h" 16 #include <bitset> 17 18 using namespace llvm; 19 20 // MSVC emits references to this into the translation units which reference it. 21 #ifndef _MSC_VER 22 constexpr size_t StringRef::npos; 23 #endif 24 25 // strncasecmp() is not available on non-POSIX systems, so define an 26 // alternative function here. 27 static int ascii_strncasecmp(StringRef LHS, StringRef RHS) { 28 for (auto [LC, RC] : zip_equal(LHS, RHS)) { 29 unsigned char LHC = toLower(LC); 30 unsigned char RHC = toLower(RC); 31 if (LHC != RHC) 32 return LHC < RHC ? -1 : 1; 33 } 34 return 0; 35 } 36 37 int StringRef::compare_insensitive(StringRef RHS) const { 38 size_t Min = std::min(size(), RHS.size()); 39 if (int Res = ascii_strncasecmp(take_front(Min), RHS.take_front(Min))) 40 return Res; 41 if (size() == RHS.size()) 42 return 0; 43 return size() < RHS.size() ? -1 : 1; 44 } 45 46 bool StringRef::starts_with_insensitive(StringRef Prefix) const { 47 return size() >= Prefix.size() && 48 ascii_strncasecmp(take_front(Prefix.size()), Prefix) == 0; 49 } 50 51 bool StringRef::ends_with_insensitive(StringRef Suffix) const { 52 return size() >= Suffix.size() && 53 ascii_strncasecmp(take_back(Suffix.size()), Suffix) == 0; 54 } 55 56 size_t StringRef::find_insensitive(char C, size_t From) const { 57 char L = toLower(C); 58 return find_if([L](char D) { return toLower(D) == L; }, From); 59 } 60 61 /// compare_numeric - Compare strings, handle embedded numbers. 62 int StringRef::compare_numeric(StringRef RHS) const { 63 for (size_t I = 0, E = std::min(size(), RHS.size()); I != E; ++I) { 64 // Check for sequences of digits. 65 if (isDigit(data()[I]) && isDigit(RHS.data()[I])) { 66 // The longer sequence of numbers is considered larger. 67 // This doesn't really handle prefixed zeros well. 68 size_t J; 69 for (J = I + 1; J != E + 1; ++J) { 70 bool ld = J < size() && isDigit(data()[J]); 71 bool rd = J < RHS.size() && isDigit(RHS.data()[J]); 72 if (ld != rd) 73 return rd ? -1 : 1; 74 if (!rd) 75 break; 76 } 77 // The two number sequences have the same length (J-I), just memcmp them. 78 if (int Res = compareMemory(data() + I, RHS.data() + I, J - I)) 79 return Res < 0 ? -1 : 1; 80 // Identical number sequences, continue search after the numbers. 81 I = J - 1; 82 continue; 83 } 84 if (data()[I] != RHS.data()[I]) 85 return (unsigned char)data()[I] < (unsigned char)RHS.data()[I] ? -1 : 1; 86 } 87 if (size() == RHS.size()) 88 return 0; 89 return size() < RHS.size() ? -1 : 1; 90 } 91 92 // Compute the edit distance between the two given strings. 93 unsigned StringRef::edit_distance(llvm::StringRef Other, 94 bool AllowReplacements, 95 unsigned MaxEditDistance) const { 96 return llvm::ComputeEditDistance(ArrayRef(data(), size()), 97 ArrayRef(Other.data(), Other.size()), 98 AllowReplacements, MaxEditDistance); 99 } 100 101 unsigned llvm::StringRef::edit_distance_insensitive( 102 StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const { 103 return llvm::ComputeMappedEditDistance( 104 ArrayRef(data(), size()), ArrayRef(Other.data(), Other.size()), 105 llvm::toLower, AllowReplacements, MaxEditDistance); 106 } 107 108 //===----------------------------------------------------------------------===// 109 // String Operations 110 //===----------------------------------------------------------------------===// 111 112 std::string StringRef::lower() const { 113 return std::string(map_iterator(begin(), toLower), 114 map_iterator(end(), toLower)); 115 } 116 117 std::string StringRef::upper() const { 118 return std::string(map_iterator(begin(), toUpper), 119 map_iterator(end(), toUpper)); 120 } 121 122 //===----------------------------------------------------------------------===// 123 // String Searching 124 //===----------------------------------------------------------------------===// 125 126 127 /// find - Search for the first string \arg Str in the string. 128 /// 129 /// \return - The index of the first occurrence of \arg Str, or npos if not 130 /// found. 131 size_t StringRef::find(StringRef Str, size_t From) const { 132 if (From > size()) 133 return npos; 134 135 const char *Start = data() + From; 136 size_t Size = size() - From; 137 138 const char *Needle = Str.data(); 139 size_t N = Str.size(); 140 if (N == 0) 141 return From; 142 if (Size < N) 143 return npos; 144 if (N == 1) { 145 const char *Ptr = (const char *)::memchr(Start, Needle[0], Size); 146 return Ptr == nullptr ? npos : Ptr - data(); 147 } 148 149 const char *Stop = Start + (Size - N + 1); 150 151 if (N == 2) { 152 // Provide a fast path for newline finding (CRLF case) in InclusionRewriter. 153 // Not the most optimized strategy, but getting memcmp inlined should be 154 // good enough. 155 do { 156 if (std::memcmp(Start, Needle, 2) == 0) 157 return Start - data(); 158 ++Start; 159 } while (Start < Stop); 160 return npos; 161 } 162 163 // For short haystacks or unsupported needles fall back to the naive algorithm 164 if (Size < 16 || N > 255) { 165 do { 166 if (std::memcmp(Start, Needle, N) == 0) 167 return Start - data(); 168 ++Start; 169 } while (Start < Stop); 170 return npos; 171 } 172 173 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 174 uint8_t BadCharSkip[256]; 175 std::memset(BadCharSkip, N, 256); 176 for (unsigned i = 0; i != N-1; ++i) 177 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 178 179 do { 180 uint8_t Last = Start[N - 1]; 181 if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1])) 182 if (std::memcmp(Start, Needle, N - 1) == 0) 183 return Start - data(); 184 185 // Otherwise skip the appropriate number of bytes. 186 Start += BadCharSkip[Last]; 187 } while (Start < Stop); 188 189 return npos; 190 } 191 192 size_t StringRef::find_insensitive(StringRef Str, size_t From) const { 193 StringRef This = substr(From); 194 while (This.size() >= Str.size()) { 195 if (This.starts_with_insensitive(Str)) 196 return From; 197 This = This.drop_front(); 198 ++From; 199 } 200 return npos; 201 } 202 203 size_t StringRef::rfind_insensitive(char C, size_t From) const { 204 From = std::min(From, size()); 205 size_t i = From; 206 while (i != 0) { 207 --i; 208 if (toLower(data()[i]) == toLower(C)) 209 return i; 210 } 211 return npos; 212 } 213 214 /// rfind - Search for the last string \arg Str in the string. 215 /// 216 /// \return - The index of the last occurrence of \arg Str, or npos if not 217 /// found. 218 size_t StringRef::rfind(StringRef Str) const { 219 return std::string_view(*this).rfind(Str); 220 } 221 222 size_t StringRef::rfind_insensitive(StringRef Str) const { 223 size_t N = Str.size(); 224 if (N > size()) 225 return npos; 226 for (size_t i = size() - N + 1, e = 0; i != e;) { 227 --i; 228 if (substr(i, N).equals_insensitive(Str)) 229 return i; 230 } 231 return npos; 232 } 233 234 /// find_first_of - Find the first character in the string that is in \arg 235 /// Chars, or npos if not found. 236 /// 237 /// Note: O(size() + Chars.size()) 238 StringRef::size_type StringRef::find_first_of(StringRef Chars, 239 size_t From) const { 240 std::bitset<1 << CHAR_BIT> CharBits; 241 for (char C : Chars) 242 CharBits.set((unsigned char)C); 243 244 for (size_type i = std::min(From, size()), e = size(); i != e; ++i) 245 if (CharBits.test((unsigned char)data()[i])) 246 return i; 247 return npos; 248 } 249 250 /// find_first_not_of - Find the first character in the string that is not 251 /// \arg C or npos if not found. 252 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 253 return std::string_view(*this).find_first_not_of(C, From); 254 } 255 256 /// find_first_not_of - Find the first character in the string that is not 257 /// in the string \arg Chars, or npos if not found. 258 /// 259 /// Note: O(size() + Chars.size()) 260 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 261 size_t From) const { 262 std::bitset<1 << CHAR_BIT> CharBits; 263 for (char C : Chars) 264 CharBits.set((unsigned char)C); 265 266 for (size_type i = std::min(From, size()), e = size(); i != e; ++i) 267 if (!CharBits.test((unsigned char)data()[i])) 268 return i; 269 return npos; 270 } 271 272 /// find_last_of - Find the last character in the string that is in \arg C, 273 /// or npos if not found. 274 /// 275 /// Note: O(size() + Chars.size()) 276 StringRef::size_type StringRef::find_last_of(StringRef Chars, 277 size_t From) const { 278 std::bitset<1 << CHAR_BIT> CharBits; 279 for (char C : Chars) 280 CharBits.set((unsigned char)C); 281 282 for (size_type i = std::min(From, size()) - 1, e = -1; i != e; --i) 283 if (CharBits.test((unsigned char)data()[i])) 284 return i; 285 return npos; 286 } 287 288 /// find_last_not_of - Find the last character in the string that is not 289 /// \arg C, or npos if not found. 290 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 291 for (size_type i = std::min(From, size()) - 1, e = -1; i != e; --i) 292 if (data()[i] != C) 293 return i; 294 return npos; 295 } 296 297 /// find_last_not_of - Find the last character in the string that is not in 298 /// \arg Chars, or npos if not found. 299 /// 300 /// Note: O(size() + Chars.size()) 301 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 302 size_t From) const { 303 std::bitset<1 << CHAR_BIT> CharBits; 304 for (char C : Chars) 305 CharBits.set((unsigned char)C); 306 307 for (size_type i = std::min(From, size()) - 1, e = -1; i != e; --i) 308 if (!CharBits.test((unsigned char)data()[i])) 309 return i; 310 return npos; 311 } 312 313 void StringRef::split(SmallVectorImpl<StringRef> &A, 314 StringRef Separator, int MaxSplit, 315 bool KeepEmpty) const { 316 StringRef S = *this; 317 318 // Count down from MaxSplit. When MaxSplit is -1, this will just split 319 // "forever". This doesn't support splitting more than 2^31 times 320 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 321 // but that seems unlikely to be useful. 322 while (MaxSplit-- != 0) { 323 size_t Idx = S.find(Separator); 324 if (Idx == npos) 325 break; 326 327 // Push this split. 328 if (KeepEmpty || Idx > 0) 329 A.push_back(S.slice(0, Idx)); 330 331 // Jump forward. 332 S = S.substr(Idx + Separator.size()); 333 } 334 335 // Push the tail. 336 if (KeepEmpty || !S.empty()) 337 A.push_back(S); 338 } 339 340 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 341 int MaxSplit, bool KeepEmpty) const { 342 StringRef S = *this; 343 344 // Count down from MaxSplit. When MaxSplit is -1, this will just split 345 // "forever". This doesn't support splitting more than 2^31 times 346 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 347 // but that seems unlikely to be useful. 348 while (MaxSplit-- != 0) { 349 size_t Idx = S.find(Separator); 350 if (Idx == npos) 351 break; 352 353 // Push this split. 354 if (KeepEmpty || Idx > 0) 355 A.push_back(S.slice(0, Idx)); 356 357 // Jump forward. 358 S = S.substr(Idx + 1); 359 } 360 361 // Push the tail. 362 if (KeepEmpty || !S.empty()) 363 A.push_back(S); 364 } 365 366 //===----------------------------------------------------------------------===// 367 // Helpful Algorithms 368 //===----------------------------------------------------------------------===// 369 370 /// count - Return the number of non-overlapped occurrences of \arg Str in 371 /// the string. 372 size_t StringRef::count(StringRef Str) const { 373 size_t Count = 0; 374 size_t Pos = 0; 375 size_t N = Str.size(); 376 // TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing 377 // this to `Length + 1` which is more in-line with the function 378 // description. 379 if (!N) 380 return 0; 381 while ((Pos = find(Str, Pos)) != npos) { 382 ++Count; 383 Pos += N; 384 } 385 return Count; 386 } 387 388 static unsigned GetAutoSenseRadix(StringRef &Str) { 389 if (Str.empty()) 390 return 10; 391 392 if (Str.consume_front_insensitive("0x")) 393 return 16; 394 395 if (Str.consume_front_insensitive("0b")) 396 return 2; 397 398 if (Str.consume_front("0o")) 399 return 8; 400 401 if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) { 402 Str = Str.substr(1); 403 return 8; 404 } 405 406 return 10; 407 } 408 409 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 410 unsigned long long &Result) { 411 // Autosense radix if not specified. 412 if (Radix == 0) 413 Radix = GetAutoSenseRadix(Str); 414 415 // Empty strings (after the radix autosense) are invalid. 416 if (Str.empty()) return true; 417 418 // Parse all the bytes of the string given this radix. Watch for overflow. 419 StringRef Str2 = Str; 420 Result = 0; 421 while (!Str2.empty()) { 422 unsigned CharVal; 423 if (Str2[0] >= '0' && Str2[0] <= '9') 424 CharVal = Str2[0] - '0'; 425 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 426 CharVal = Str2[0] - 'a' + 10; 427 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 428 CharVal = Str2[0] - 'A' + 10; 429 else 430 break; 431 432 // If the parsed value is larger than the integer radix, we cannot 433 // consume any more characters. 434 if (CharVal >= Radix) 435 break; 436 437 // Add in this character. 438 unsigned long long PrevResult = Result; 439 Result = Result * Radix + CharVal; 440 441 // Check for overflow by shifting back and seeing if bits were lost. 442 if (Result / Radix < PrevResult) 443 return true; 444 445 Str2 = Str2.substr(1); 446 } 447 448 // We consider the operation a failure if no characters were consumed 449 // successfully. 450 if (Str.size() == Str2.size()) 451 return true; 452 453 Str = Str2; 454 return false; 455 } 456 457 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 458 long long &Result) { 459 unsigned long long ULLVal; 460 461 // Handle positive strings first. 462 if (!Str.starts_with("-")) { 463 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 464 // Check for value so large it overflows a signed value. 465 (long long)ULLVal < 0) 466 return true; 467 Result = ULLVal; 468 return false; 469 } 470 471 // Get the positive part of the value. 472 StringRef Str2 = Str.drop_front(1); 473 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 474 // Reject values so large they'd overflow as negative signed, but allow 475 // "-0". This negates the unsigned so that the negative isn't undefined 476 // on signed overflow. 477 (long long)-ULLVal > 0) 478 return true; 479 480 Str = Str2; 481 Result = -ULLVal; 482 return false; 483 } 484 485 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 486 /// sequence of radix up to 36 to an unsigned long long value. 487 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 488 unsigned long long &Result) { 489 if (consumeUnsignedInteger(Str, Radix, Result)) 490 return true; 491 492 // For getAsUnsignedInteger, we require the whole string to be consumed or 493 // else we consider it a failure. 494 return !Str.empty(); 495 } 496 497 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 498 long long &Result) { 499 if (consumeSignedInteger(Str, Radix, Result)) 500 return true; 501 502 // For getAsSignedInteger, we require the whole string to be consumed or else 503 // we consider it a failure. 504 return !Str.empty(); 505 } 506 507 bool StringRef::consumeInteger(unsigned Radix, APInt &Result) { 508 StringRef Str = *this; 509 510 // Autosense radix if not specified. 511 if (Radix == 0) 512 Radix = GetAutoSenseRadix(Str); 513 514 assert(Radix > 1 && Radix <= 36); 515 516 // Empty strings (after the radix autosense) are invalid. 517 if (Str.empty()) return true; 518 519 // Skip leading zeroes. This can be a significant improvement if 520 // it means we don't need > 64 bits. 521 Str = Str.ltrim('0'); 522 523 // If it was nothing but zeroes.... 524 if (Str.empty()) { 525 Result = APInt(64, 0); 526 *this = Str; 527 return false; 528 } 529 530 // (Over-)estimate the required number of bits. 531 unsigned Log2Radix = 0; 532 while ((1U << Log2Radix) < Radix) Log2Radix++; 533 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 534 535 unsigned BitWidth = Log2Radix * Str.size(); 536 if (BitWidth < Result.getBitWidth()) 537 BitWidth = Result.getBitWidth(); // don't shrink the result 538 else if (BitWidth > Result.getBitWidth()) 539 Result = Result.zext(BitWidth); 540 541 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 542 if (!IsPowerOf2Radix) { 543 // These must have the same bit-width as Result. 544 RadixAP = APInt(BitWidth, Radix); 545 CharAP = APInt(BitWidth, 0); 546 } 547 548 // Parse all the bytes of the string given this radix. 549 Result = 0; 550 while (!Str.empty()) { 551 unsigned CharVal; 552 if (Str[0] >= '0' && Str[0] <= '9') 553 CharVal = Str[0]-'0'; 554 else if (Str[0] >= 'a' && Str[0] <= 'z') 555 CharVal = Str[0]-'a'+10; 556 else if (Str[0] >= 'A' && Str[0] <= 'Z') 557 CharVal = Str[0]-'A'+10; 558 else 559 break; 560 561 // If the parsed value is larger than the integer radix, the string is 562 // invalid. 563 if (CharVal >= Radix) 564 break; 565 566 // Add in this character. 567 if (IsPowerOf2Radix) { 568 Result <<= Log2Radix; 569 Result |= CharVal; 570 } else { 571 Result *= RadixAP; 572 CharAP = CharVal; 573 Result += CharAP; 574 } 575 576 Str = Str.substr(1); 577 } 578 579 // We consider the operation a failure if no characters were consumed 580 // successfully. 581 if (size() == Str.size()) 582 return true; 583 584 *this = Str; 585 return false; 586 } 587 588 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 589 StringRef Str = *this; 590 if (Str.consumeInteger(Radix, Result)) 591 return true; 592 593 // For getAsInteger, we require the whole string to be consumed or else we 594 // consider it a failure. 595 return !Str.empty(); 596 } 597 598 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { 599 APFloat F(0.0); 600 auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); 601 if (errorToBool(StatusOrErr.takeError())) 602 return true; 603 604 APFloat::opStatus Status = *StatusOrErr; 605 if (Status != APFloat::opOK) { 606 if (!AllowInexact || !(Status & APFloat::opInexact)) 607 return true; 608 } 609 610 Result = F.convertToDouble(); 611 return false; 612 } 613 614 // Implementation of StringRef hashing. 615 hash_code llvm::hash_value(StringRef S) { return hash_combine_range(S); } 616 617 unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) { 618 assert(Val.data() != getEmptyKey().data() && 619 "Cannot hash the empty key!"); 620 assert(Val.data() != getTombstoneKey().data() && 621 "Cannot hash the tombstone key!"); 622 return (unsigned)(hash_value(Val)); 623 } 624