1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/ADT/StringRef.h" 10 #include "llvm/ADT/APFloat.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/edit_distance.h" 15 #include "llvm/Support/Error.h" 16 #include <bitset> 17 18 using namespace llvm; 19 20 // MSVC emits references to this into the translation units which reference it. 21 #ifndef _MSC_VER 22 constexpr size_t StringRef::npos; 23 #endif 24 25 // strncasecmp() is not available on non-POSIX systems, so define an 26 // alternative function here. 27 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 28 for (size_t I = 0; I < Length; ++I) { 29 unsigned char LHC = toLower(LHS[I]); 30 unsigned char RHC = toLower(RHS[I]); 31 if (LHC != RHC) 32 return LHC < RHC ? -1 : 1; 33 } 34 return 0; 35 } 36 37 int StringRef::compare_insensitive(StringRef RHS) const { 38 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 39 return Res; 40 if (Length == RHS.Length) 41 return 0; 42 return Length < RHS.Length ? -1 : 1; 43 } 44 45 bool StringRef::startswith_insensitive(StringRef Prefix) const { 46 return Length >= Prefix.Length && 47 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 48 } 49 50 bool StringRef::endswith_insensitive(StringRef Suffix) const { 51 return Length >= Suffix.Length && 52 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 53 } 54 55 size_t StringRef::find_insensitive(char C, size_t From) const { 56 char L = toLower(C); 57 return find_if([L](char D) { return toLower(D) == L; }, From); 58 } 59 60 /// compare_numeric - Compare strings, handle embedded numbers. 61 int StringRef::compare_numeric(StringRef RHS) const { 62 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 63 // Check for sequences of digits. 64 if (isDigit(Data[I]) && isDigit(RHS.Data[I])) { 65 // The longer sequence of numbers is considered larger. 66 // This doesn't really handle prefixed zeros well. 67 size_t J; 68 for (J = I + 1; J != E + 1; ++J) { 69 bool ld = J < Length && isDigit(Data[J]); 70 bool rd = J < RHS.Length && isDigit(RHS.Data[J]); 71 if (ld != rd) 72 return rd ? -1 : 1; 73 if (!rd) 74 break; 75 } 76 // The two number sequences have the same length (J-I), just memcmp them. 77 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 78 return Res < 0 ? -1 : 1; 79 // Identical number sequences, continue search after the numbers. 80 I = J - 1; 81 continue; 82 } 83 if (Data[I] != RHS.Data[I]) 84 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 85 } 86 if (Length == RHS.Length) 87 return 0; 88 return Length < RHS.Length ? -1 : 1; 89 } 90 91 // Compute the edit distance between the two given strings. 92 unsigned StringRef::edit_distance(llvm::StringRef Other, 93 bool AllowReplacements, 94 unsigned MaxEditDistance) const { 95 return llvm::ComputeEditDistance( 96 makeArrayRef(data(), size()), 97 makeArrayRef(Other.data(), Other.size()), 98 AllowReplacements, MaxEditDistance); 99 } 100 101 //===----------------------------------------------------------------------===// 102 // String Operations 103 //===----------------------------------------------------------------------===// 104 105 std::string StringRef::lower() const { 106 return std::string(map_iterator(begin(), toLower), 107 map_iterator(end(), toLower)); 108 } 109 110 std::string StringRef::upper() const { 111 return std::string(map_iterator(begin(), toUpper), 112 map_iterator(end(), toUpper)); 113 } 114 115 //===----------------------------------------------------------------------===// 116 // String Searching 117 //===----------------------------------------------------------------------===// 118 119 120 /// find - Search for the first string \arg Str in the string. 121 /// 122 /// \return - The index of the first occurrence of \arg Str, or npos if not 123 /// found. 124 size_t StringRef::find(StringRef Str, size_t From) const { 125 if (From > Length) 126 return npos; 127 128 const char *Start = Data + From; 129 size_t Size = Length - From; 130 131 const char *Needle = Str.data(); 132 size_t N = Str.size(); 133 if (N == 0) 134 return From; 135 if (Size < N) 136 return npos; 137 if (N == 1) { 138 const char *Ptr = (const char *)::memchr(Start, Needle[0], Size); 139 return Ptr == nullptr ? npos : Ptr - Data; 140 } 141 142 const char *Stop = Start + (Size - N + 1); 143 144 // For short haystacks or unsupported needles fall back to the naive algorithm 145 if (Size < 16 || N > 255) { 146 do { 147 if (std::memcmp(Start, Needle, N) == 0) 148 return Start - Data; 149 ++Start; 150 } while (Start < Stop); 151 return npos; 152 } 153 154 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 155 uint8_t BadCharSkip[256]; 156 std::memset(BadCharSkip, N, 256); 157 for (unsigned i = 0; i != N-1; ++i) 158 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 159 160 do { 161 uint8_t Last = Start[N - 1]; 162 if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1])) 163 if (std::memcmp(Start, Needle, N - 1) == 0) 164 return Start - Data; 165 166 // Otherwise skip the appropriate number of bytes. 167 Start += BadCharSkip[Last]; 168 } while (Start < Stop); 169 170 return npos; 171 } 172 173 size_t StringRef::find_insensitive(StringRef Str, size_t From) const { 174 StringRef This = substr(From); 175 while (This.size() >= Str.size()) { 176 if (This.startswith_insensitive(Str)) 177 return From; 178 This = This.drop_front(); 179 ++From; 180 } 181 return npos; 182 } 183 184 size_t StringRef::rfind_insensitive(char C, size_t From) const { 185 From = std::min(From, Length); 186 size_t i = From; 187 while (i != 0) { 188 --i; 189 if (toLower(Data[i]) == toLower(C)) 190 return i; 191 } 192 return npos; 193 } 194 195 /// rfind - Search for the last string \arg Str in the string. 196 /// 197 /// \return - The index of the last occurrence of \arg Str, or npos if not 198 /// found. 199 size_t StringRef::rfind(StringRef Str) const { 200 size_t N = Str.size(); 201 if (N > Length) 202 return npos; 203 for (size_t i = Length - N + 1, e = 0; i != e;) { 204 --i; 205 if (substr(i, N).equals(Str)) 206 return i; 207 } 208 return npos; 209 } 210 211 size_t StringRef::rfind_insensitive(StringRef Str) const { 212 size_t N = Str.size(); 213 if (N > Length) 214 return npos; 215 for (size_t i = Length - N + 1, e = 0; i != e;) { 216 --i; 217 if (substr(i, N).equals_insensitive(Str)) 218 return i; 219 } 220 return npos; 221 } 222 223 /// find_first_of - Find the first character in the string that is in \arg 224 /// Chars, or npos if not found. 225 /// 226 /// Note: O(size() + Chars.size()) 227 StringRef::size_type StringRef::find_first_of(StringRef Chars, 228 size_t From) const { 229 std::bitset<1 << CHAR_BIT> CharBits; 230 for (char C : Chars) 231 CharBits.set((unsigned char)C); 232 233 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 234 if (CharBits.test((unsigned char)Data[i])) 235 return i; 236 return npos; 237 } 238 239 /// find_first_not_of - Find the first character in the string that is not 240 /// \arg C or npos if not found. 241 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 242 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 243 if (Data[i] != C) 244 return i; 245 return npos; 246 } 247 248 /// find_first_not_of - Find the first character in the string that is not 249 /// in the string \arg Chars, or npos if not found. 250 /// 251 /// Note: O(size() + Chars.size()) 252 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 253 size_t From) const { 254 std::bitset<1 << CHAR_BIT> CharBits; 255 for (char C : Chars) 256 CharBits.set((unsigned char)C); 257 258 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 259 if (!CharBits.test((unsigned char)Data[i])) 260 return i; 261 return npos; 262 } 263 264 /// find_last_of - Find the last character in the string that is in \arg C, 265 /// or npos if not found. 266 /// 267 /// Note: O(size() + Chars.size()) 268 StringRef::size_type StringRef::find_last_of(StringRef Chars, 269 size_t From) const { 270 std::bitset<1 << CHAR_BIT> CharBits; 271 for (char C : Chars) 272 CharBits.set((unsigned char)C); 273 274 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 275 if (CharBits.test((unsigned char)Data[i])) 276 return i; 277 return npos; 278 } 279 280 /// find_last_not_of - Find the last character in the string that is not 281 /// \arg C, or npos if not found. 282 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 283 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 284 if (Data[i] != C) 285 return i; 286 return npos; 287 } 288 289 /// find_last_not_of - Find the last character in the string that is not in 290 /// \arg Chars, or npos if not found. 291 /// 292 /// Note: O(size() + Chars.size()) 293 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 294 size_t From) const { 295 std::bitset<1 << CHAR_BIT> CharBits; 296 for (char C : Chars) 297 CharBits.set((unsigned char)C); 298 299 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 300 if (!CharBits.test((unsigned char)Data[i])) 301 return i; 302 return npos; 303 } 304 305 void StringRef::split(SmallVectorImpl<StringRef> &A, 306 StringRef Separator, int MaxSplit, 307 bool KeepEmpty) const { 308 StringRef S = *this; 309 310 // Count down from MaxSplit. When MaxSplit is -1, this will just split 311 // "forever". This doesn't support splitting more than 2^31 times 312 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 313 // but that seems unlikely to be useful. 314 while (MaxSplit-- != 0) { 315 size_t Idx = S.find(Separator); 316 if (Idx == npos) 317 break; 318 319 // Push this split. 320 if (KeepEmpty || Idx > 0) 321 A.push_back(S.slice(0, Idx)); 322 323 // Jump forward. 324 S = S.slice(Idx + Separator.size(), npos); 325 } 326 327 // Push the tail. 328 if (KeepEmpty || !S.empty()) 329 A.push_back(S); 330 } 331 332 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 333 int MaxSplit, bool KeepEmpty) const { 334 StringRef S = *this; 335 336 // Count down from MaxSplit. When MaxSplit is -1, this will just split 337 // "forever". This doesn't support splitting more than 2^31 times 338 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 339 // but that seems unlikely to be useful. 340 while (MaxSplit-- != 0) { 341 size_t Idx = S.find(Separator); 342 if (Idx == npos) 343 break; 344 345 // Push this split. 346 if (KeepEmpty || Idx > 0) 347 A.push_back(S.slice(0, Idx)); 348 349 // Jump forward. 350 S = S.slice(Idx + 1, npos); 351 } 352 353 // Push the tail. 354 if (KeepEmpty || !S.empty()) 355 A.push_back(S); 356 } 357 358 //===----------------------------------------------------------------------===// 359 // Helpful Algorithms 360 //===----------------------------------------------------------------------===// 361 362 /// count - Return the number of non-overlapped occurrences of \arg Str in 363 /// the string. 364 size_t StringRef::count(StringRef Str) const { 365 size_t Count = 0; 366 size_t N = Str.size(); 367 if (!N || N > Length) 368 return 0; 369 for (size_t i = 0, e = Length - N + 1; i < e;) { 370 if (substr(i, N).equals(Str)) { 371 ++Count; 372 i += N; 373 } 374 else 375 ++i; 376 } 377 return Count; 378 } 379 380 static unsigned GetAutoSenseRadix(StringRef &Str) { 381 if (Str.empty()) 382 return 10; 383 384 if (Str.startswith("0x") || Str.startswith("0X")) { 385 Str = Str.substr(2); 386 return 16; 387 } 388 389 if (Str.startswith("0b") || Str.startswith("0B")) { 390 Str = Str.substr(2); 391 return 2; 392 } 393 394 if (Str.startswith("0o")) { 395 Str = Str.substr(2); 396 return 8; 397 } 398 399 if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) { 400 Str = Str.substr(1); 401 return 8; 402 } 403 404 return 10; 405 } 406 407 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 408 unsigned long long &Result) { 409 // Autosense radix if not specified. 410 if (Radix == 0) 411 Radix = GetAutoSenseRadix(Str); 412 413 // Empty strings (after the radix autosense) are invalid. 414 if (Str.empty()) return true; 415 416 // Parse all the bytes of the string given this radix. Watch for overflow. 417 StringRef Str2 = Str; 418 Result = 0; 419 while (!Str2.empty()) { 420 unsigned CharVal; 421 if (Str2[0] >= '0' && Str2[0] <= '9') 422 CharVal = Str2[0] - '0'; 423 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 424 CharVal = Str2[0] - 'a' + 10; 425 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 426 CharVal = Str2[0] - 'A' + 10; 427 else 428 break; 429 430 // If the parsed value is larger than the integer radix, we cannot 431 // consume any more characters. 432 if (CharVal >= Radix) 433 break; 434 435 // Add in this character. 436 unsigned long long PrevResult = Result; 437 Result = Result * Radix + CharVal; 438 439 // Check for overflow by shifting back and seeing if bits were lost. 440 if (Result / Radix < PrevResult) 441 return true; 442 443 Str2 = Str2.substr(1); 444 } 445 446 // We consider the operation a failure if no characters were consumed 447 // successfully. 448 if (Str.size() == Str2.size()) 449 return true; 450 451 Str = Str2; 452 return false; 453 } 454 455 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 456 long long &Result) { 457 unsigned long long ULLVal; 458 459 // Handle positive strings first. 460 if (Str.empty() || Str.front() != '-') { 461 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 462 // Check for value so large it overflows a signed value. 463 (long long)ULLVal < 0) 464 return true; 465 Result = ULLVal; 466 return false; 467 } 468 469 // Get the positive part of the value. 470 StringRef Str2 = Str.drop_front(1); 471 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 472 // Reject values so large they'd overflow as negative signed, but allow 473 // "-0". This negates the unsigned so that the negative isn't undefined 474 // on signed overflow. 475 (long long)-ULLVal > 0) 476 return true; 477 478 Str = Str2; 479 Result = -ULLVal; 480 return false; 481 } 482 483 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 484 /// sequence of radix up to 36 to an unsigned long long value. 485 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 486 unsigned long long &Result) { 487 if (consumeUnsignedInteger(Str, Radix, Result)) 488 return true; 489 490 // For getAsUnsignedInteger, we require the whole string to be consumed or 491 // else we consider it a failure. 492 return !Str.empty(); 493 } 494 495 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 496 long long &Result) { 497 if (consumeSignedInteger(Str, Radix, Result)) 498 return true; 499 500 // For getAsSignedInteger, we require the whole string to be consumed or else 501 // we consider it a failure. 502 return !Str.empty(); 503 } 504 505 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 506 StringRef Str = *this; 507 508 // Autosense radix if not specified. 509 if (Radix == 0) 510 Radix = GetAutoSenseRadix(Str); 511 512 assert(Radix > 1 && Radix <= 36); 513 514 // Empty strings (after the radix autosense) are invalid. 515 if (Str.empty()) return true; 516 517 // Skip leading zeroes. This can be a significant improvement if 518 // it means we don't need > 64 bits. 519 while (!Str.empty() && Str.front() == '0') 520 Str = Str.substr(1); 521 522 // If it was nothing but zeroes.... 523 if (Str.empty()) { 524 Result = APInt(64, 0); 525 return false; 526 } 527 528 // (Over-)estimate the required number of bits. 529 unsigned Log2Radix = 0; 530 while ((1U << Log2Radix) < Radix) Log2Radix++; 531 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 532 533 unsigned BitWidth = Log2Radix * Str.size(); 534 if (BitWidth < Result.getBitWidth()) 535 BitWidth = Result.getBitWidth(); // don't shrink the result 536 else if (BitWidth > Result.getBitWidth()) 537 Result = Result.zext(BitWidth); 538 539 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 540 if (!IsPowerOf2Radix) { 541 // These must have the same bit-width as Result. 542 RadixAP = APInt(BitWidth, Radix); 543 CharAP = APInt(BitWidth, 0); 544 } 545 546 // Parse all the bytes of the string given this radix. 547 Result = 0; 548 while (!Str.empty()) { 549 unsigned CharVal; 550 if (Str[0] >= '0' && Str[0] <= '9') 551 CharVal = Str[0]-'0'; 552 else if (Str[0] >= 'a' && Str[0] <= 'z') 553 CharVal = Str[0]-'a'+10; 554 else if (Str[0] >= 'A' && Str[0] <= 'Z') 555 CharVal = Str[0]-'A'+10; 556 else 557 return true; 558 559 // If the parsed value is larger than the integer radix, the string is 560 // invalid. 561 if (CharVal >= Radix) 562 return true; 563 564 // Add in this character. 565 if (IsPowerOf2Radix) { 566 Result <<= Log2Radix; 567 Result |= CharVal; 568 } else { 569 Result *= RadixAP; 570 CharAP = CharVal; 571 Result += CharAP; 572 } 573 574 Str = Str.substr(1); 575 } 576 577 return false; 578 } 579 580 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { 581 APFloat F(0.0); 582 auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); 583 if (errorToBool(StatusOrErr.takeError())) 584 return true; 585 586 APFloat::opStatus Status = *StatusOrErr; 587 if (Status != APFloat::opOK) { 588 if (!AllowInexact || !(Status & APFloat::opInexact)) 589 return true; 590 } 591 592 Result = F.convertToDouble(); 593 return false; 594 } 595 596 // Implementation of StringRef hashing. 597 hash_code llvm::hash_value(StringRef S) { 598 return hash_combine_range(S.begin(), S.end()); 599 } 600 601 unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) { 602 assert(Val.data() != getEmptyKey().data() && 603 "Cannot hash the empty key!"); 604 assert(Val.data() != getTombstoneKey().data() && 605 "Cannot hash the tombstone key!"); 606 return (unsigned)(hash_value(Val)); 607 } 608