1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/ADT/StringRef.h" 10 #include "llvm/ADT/APFloat.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/edit_distance.h" 15 #include "llvm/Support/Error.h" 16 #include <bitset> 17 18 using namespace llvm; 19 20 // MSVC emits references to this into the translation units which reference it. 21 #ifndef _MSC_VER 22 const size_t StringRef::npos; 23 #endif 24 25 // strncasecmp() is not available on non-POSIX systems, so define an 26 // alternative function here. 27 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 28 for (size_t I = 0; I < Length; ++I) { 29 unsigned char LHC = toLower(LHS[I]); 30 unsigned char RHC = toLower(RHS[I]); 31 if (LHC != RHC) 32 return LHC < RHC ? -1 : 1; 33 } 34 return 0; 35 } 36 37 /// compare_lower - Compare strings, ignoring case. 38 int StringRef::compare_lower(StringRef RHS) const { 39 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 40 return Res; 41 if (Length == RHS.Length) 42 return 0; 43 return Length < RHS.Length ? -1 : 1; 44 } 45 46 /// Check if this string starts with the given \p Prefix, ignoring case. 47 bool StringRef::startswith_lower(StringRef Prefix) const { 48 return Length >= Prefix.Length && 49 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 50 } 51 52 /// Check if this string ends with the given \p Suffix, ignoring case. 53 bool StringRef::endswith_lower(StringRef Suffix) const { 54 return Length >= Suffix.Length && 55 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 56 } 57 58 size_t StringRef::find_lower(char C, size_t From) const { 59 char L = toLower(C); 60 return find_if([L](char D) { return toLower(D) == L; }, From); 61 } 62 63 /// compare_numeric - Compare strings, handle embedded numbers. 64 int StringRef::compare_numeric(StringRef RHS) const { 65 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 66 // Check for sequences of digits. 67 if (isDigit(Data[I]) && isDigit(RHS.Data[I])) { 68 // The longer sequence of numbers is considered larger. 69 // This doesn't really handle prefixed zeros well. 70 size_t J; 71 for (J = I + 1; J != E + 1; ++J) { 72 bool ld = J < Length && isDigit(Data[J]); 73 bool rd = J < RHS.Length && isDigit(RHS.Data[J]); 74 if (ld != rd) 75 return rd ? -1 : 1; 76 if (!rd) 77 break; 78 } 79 // The two number sequences have the same length (J-I), just memcmp them. 80 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 81 return Res < 0 ? -1 : 1; 82 // Identical number sequences, continue search after the numbers. 83 I = J - 1; 84 continue; 85 } 86 if (Data[I] != RHS.Data[I]) 87 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 88 } 89 if (Length == RHS.Length) 90 return 0; 91 return Length < RHS.Length ? -1 : 1; 92 } 93 94 // Compute the edit distance between the two given strings. 95 unsigned StringRef::edit_distance(llvm::StringRef Other, 96 bool AllowReplacements, 97 unsigned MaxEditDistance) const { 98 return llvm::ComputeEditDistance( 99 makeArrayRef(data(), size()), 100 makeArrayRef(Other.data(), Other.size()), 101 AllowReplacements, MaxEditDistance); 102 } 103 104 //===----------------------------------------------------------------------===// 105 // String Operations 106 //===----------------------------------------------------------------------===// 107 108 std::string StringRef::lower() const { 109 std::string Result(size(), char()); 110 for (size_type i = 0, e = size(); i != e; ++i) { 111 Result[i] = toLower(Data[i]); 112 } 113 return Result; 114 } 115 116 std::string StringRef::upper() const { 117 std::string Result(size(), char()); 118 for (size_type i = 0, e = size(); i != e; ++i) { 119 Result[i] = toUpper(Data[i]); 120 } 121 return Result; 122 } 123 124 //===----------------------------------------------------------------------===// 125 // String Searching 126 //===----------------------------------------------------------------------===// 127 128 129 /// find - Search for the first string \arg Str in the string. 130 /// 131 /// \return - The index of the first occurrence of \arg Str, or npos if not 132 /// found. 133 size_t StringRef::find(StringRef Str, size_t From) const { 134 if (From > Length) 135 return npos; 136 137 const char *Start = Data + From; 138 size_t Size = Length - From; 139 140 const char *Needle = Str.data(); 141 size_t N = Str.size(); 142 if (N == 0) 143 return From; 144 if (Size < N) 145 return npos; 146 if (N == 1) { 147 const char *Ptr = (const char *)::memchr(Start, Needle[0], Size); 148 return Ptr == nullptr ? npos : Ptr - Data; 149 } 150 151 const char *Stop = Start + (Size - N + 1); 152 153 // For short haystacks or unsupported needles fall back to the naive algorithm 154 if (Size < 16 || N > 255) { 155 do { 156 if (std::memcmp(Start, Needle, N) == 0) 157 return Start - Data; 158 ++Start; 159 } while (Start < Stop); 160 return npos; 161 } 162 163 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 164 uint8_t BadCharSkip[256]; 165 std::memset(BadCharSkip, N, 256); 166 for (unsigned i = 0; i != N-1; ++i) 167 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 168 169 do { 170 uint8_t Last = Start[N - 1]; 171 if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1])) 172 if (std::memcmp(Start, Needle, N - 1) == 0) 173 return Start - Data; 174 175 // Otherwise skip the appropriate number of bytes. 176 Start += BadCharSkip[Last]; 177 } while (Start < Stop); 178 179 return npos; 180 } 181 182 size_t StringRef::find_lower(StringRef Str, size_t From) const { 183 StringRef This = substr(From); 184 while (This.size() >= Str.size()) { 185 if (This.startswith_lower(Str)) 186 return From; 187 This = This.drop_front(); 188 ++From; 189 } 190 return npos; 191 } 192 193 size_t StringRef::rfind_lower(char C, size_t From) const { 194 From = std::min(From, Length); 195 size_t i = From; 196 while (i != 0) { 197 --i; 198 if (toLower(Data[i]) == toLower(C)) 199 return i; 200 } 201 return npos; 202 } 203 204 /// rfind - Search for the last string \arg Str in the string. 205 /// 206 /// \return - The index of the last occurrence of \arg Str, or npos if not 207 /// found. 208 size_t StringRef::rfind(StringRef Str) const { 209 size_t N = Str.size(); 210 if (N > Length) 211 return npos; 212 for (size_t i = Length - N + 1, e = 0; i != e;) { 213 --i; 214 if (substr(i, N).equals(Str)) 215 return i; 216 } 217 return npos; 218 } 219 220 size_t StringRef::rfind_lower(StringRef Str) const { 221 size_t N = Str.size(); 222 if (N > Length) 223 return npos; 224 for (size_t i = Length - N + 1, e = 0; i != e;) { 225 --i; 226 if (substr(i, N).equals_lower(Str)) 227 return i; 228 } 229 return npos; 230 } 231 232 /// find_first_of - Find the first character in the string that is in \arg 233 /// Chars, or npos if not found. 234 /// 235 /// Note: O(size() + Chars.size()) 236 StringRef::size_type StringRef::find_first_of(StringRef Chars, 237 size_t From) const { 238 std::bitset<1 << CHAR_BIT> CharBits; 239 for (size_type i = 0; i != Chars.size(); ++i) 240 CharBits.set((unsigned char)Chars[i]); 241 242 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 243 if (CharBits.test((unsigned char)Data[i])) 244 return i; 245 return npos; 246 } 247 248 /// find_first_not_of - Find the first character in the string that is not 249 /// \arg C or npos if not found. 250 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 251 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 252 if (Data[i] != C) 253 return i; 254 return npos; 255 } 256 257 /// find_first_not_of - Find the first character in the string that is not 258 /// in the string \arg Chars, or npos if not found. 259 /// 260 /// Note: O(size() + Chars.size()) 261 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 262 size_t From) const { 263 std::bitset<1 << CHAR_BIT> CharBits; 264 for (size_type i = 0; i != Chars.size(); ++i) 265 CharBits.set((unsigned char)Chars[i]); 266 267 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 268 if (!CharBits.test((unsigned char)Data[i])) 269 return i; 270 return npos; 271 } 272 273 /// find_last_of - Find the last character in the string that is in \arg C, 274 /// or npos if not found. 275 /// 276 /// Note: O(size() + Chars.size()) 277 StringRef::size_type StringRef::find_last_of(StringRef Chars, 278 size_t From) const { 279 std::bitset<1 << CHAR_BIT> CharBits; 280 for (size_type i = 0; i != Chars.size(); ++i) 281 CharBits.set((unsigned char)Chars[i]); 282 283 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 284 if (CharBits.test((unsigned char)Data[i])) 285 return i; 286 return npos; 287 } 288 289 /// find_last_not_of - Find the last character in the string that is not 290 /// \arg C, or npos if not found. 291 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 292 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 293 if (Data[i] != C) 294 return i; 295 return npos; 296 } 297 298 /// find_last_not_of - Find the last character in the string that is not in 299 /// \arg Chars, or npos if not found. 300 /// 301 /// Note: O(size() + Chars.size()) 302 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 303 size_t From) const { 304 std::bitset<1 << CHAR_BIT> CharBits; 305 for (size_type i = 0, e = Chars.size(); i != e; ++i) 306 CharBits.set((unsigned char)Chars[i]); 307 308 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 309 if (!CharBits.test((unsigned char)Data[i])) 310 return i; 311 return npos; 312 } 313 314 void StringRef::split(SmallVectorImpl<StringRef> &A, 315 StringRef Separator, int MaxSplit, 316 bool KeepEmpty) const { 317 StringRef S = *this; 318 319 // Count down from MaxSplit. When MaxSplit is -1, this will just split 320 // "forever". This doesn't support splitting more than 2^31 times 321 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 322 // but that seems unlikely to be useful. 323 while (MaxSplit-- != 0) { 324 size_t Idx = S.find(Separator); 325 if (Idx == npos) 326 break; 327 328 // Push this split. 329 if (KeepEmpty || Idx > 0) 330 A.push_back(S.slice(0, Idx)); 331 332 // Jump forward. 333 S = S.slice(Idx + Separator.size(), npos); 334 } 335 336 // Push the tail. 337 if (KeepEmpty || !S.empty()) 338 A.push_back(S); 339 } 340 341 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 342 int MaxSplit, bool KeepEmpty) const { 343 StringRef S = *this; 344 345 // Count down from MaxSplit. When MaxSplit is -1, this will just split 346 // "forever". This doesn't support splitting more than 2^31 times 347 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 348 // but that seems unlikely to be useful. 349 while (MaxSplit-- != 0) { 350 size_t Idx = S.find(Separator); 351 if (Idx == npos) 352 break; 353 354 // Push this split. 355 if (KeepEmpty || Idx > 0) 356 A.push_back(S.slice(0, Idx)); 357 358 // Jump forward. 359 S = S.slice(Idx + 1, npos); 360 } 361 362 // Push the tail. 363 if (KeepEmpty || !S.empty()) 364 A.push_back(S); 365 } 366 367 //===----------------------------------------------------------------------===// 368 // Helpful Algorithms 369 //===----------------------------------------------------------------------===// 370 371 /// count - Return the number of non-overlapped occurrences of \arg Str in 372 /// the string. 373 size_t StringRef::count(StringRef Str) const { 374 size_t Count = 0; 375 size_t N = Str.size(); 376 if (!N || N > Length) 377 return 0; 378 for (size_t i = 0, e = Length - N + 1; i < e;) { 379 if (substr(i, N).equals(Str)) { 380 ++Count; 381 i += N; 382 } 383 else 384 ++i; 385 } 386 return Count; 387 } 388 389 static unsigned GetAutoSenseRadix(StringRef &Str) { 390 if (Str.empty()) 391 return 10; 392 393 if (Str.startswith("0x") || Str.startswith("0X")) { 394 Str = Str.substr(2); 395 return 16; 396 } 397 398 if (Str.startswith("0b") || Str.startswith("0B")) { 399 Str = Str.substr(2); 400 return 2; 401 } 402 403 if (Str.startswith("0o")) { 404 Str = Str.substr(2); 405 return 8; 406 } 407 408 if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) { 409 Str = Str.substr(1); 410 return 8; 411 } 412 413 return 10; 414 } 415 416 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 417 unsigned long long &Result) { 418 // Autosense radix if not specified. 419 if (Radix == 0) 420 Radix = GetAutoSenseRadix(Str); 421 422 // Empty strings (after the radix autosense) are invalid. 423 if (Str.empty()) return true; 424 425 // Parse all the bytes of the string given this radix. Watch for overflow. 426 StringRef Str2 = Str; 427 Result = 0; 428 while (!Str2.empty()) { 429 unsigned CharVal; 430 if (Str2[0] >= '0' && Str2[0] <= '9') 431 CharVal = Str2[0] - '0'; 432 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 433 CharVal = Str2[0] - 'a' + 10; 434 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 435 CharVal = Str2[0] - 'A' + 10; 436 else 437 break; 438 439 // If the parsed value is larger than the integer radix, we cannot 440 // consume any more characters. 441 if (CharVal >= Radix) 442 break; 443 444 // Add in this character. 445 unsigned long long PrevResult = Result; 446 Result = Result * Radix + CharVal; 447 448 // Check for overflow by shifting back and seeing if bits were lost. 449 if (Result / Radix < PrevResult) 450 return true; 451 452 Str2 = Str2.substr(1); 453 } 454 455 // We consider the operation a failure if no characters were consumed 456 // successfully. 457 if (Str.size() == Str2.size()) 458 return true; 459 460 Str = Str2; 461 return false; 462 } 463 464 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 465 long long &Result) { 466 unsigned long long ULLVal; 467 468 // Handle positive strings first. 469 if (Str.empty() || Str.front() != '-') { 470 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 471 // Check for value so large it overflows a signed value. 472 (long long)ULLVal < 0) 473 return true; 474 Result = ULLVal; 475 return false; 476 } 477 478 // Get the positive part of the value. 479 StringRef Str2 = Str.drop_front(1); 480 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 481 // Reject values so large they'd overflow as negative signed, but allow 482 // "-0". This negates the unsigned so that the negative isn't undefined 483 // on signed overflow. 484 (long long)-ULLVal > 0) 485 return true; 486 487 Str = Str2; 488 Result = -ULLVal; 489 return false; 490 } 491 492 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 493 /// sequence of radix up to 36 to an unsigned long long value. 494 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 495 unsigned long long &Result) { 496 if (consumeUnsignedInteger(Str, Radix, Result)) 497 return true; 498 499 // For getAsUnsignedInteger, we require the whole string to be consumed or 500 // else we consider it a failure. 501 return !Str.empty(); 502 } 503 504 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 505 long long &Result) { 506 if (consumeSignedInteger(Str, Radix, Result)) 507 return true; 508 509 // For getAsSignedInteger, we require the whole string to be consumed or else 510 // we consider it a failure. 511 return !Str.empty(); 512 } 513 514 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 515 StringRef Str = *this; 516 517 // Autosense radix if not specified. 518 if (Radix == 0) 519 Radix = GetAutoSenseRadix(Str); 520 521 assert(Radix > 1 && Radix <= 36); 522 523 // Empty strings (after the radix autosense) are invalid. 524 if (Str.empty()) return true; 525 526 // Skip leading zeroes. This can be a significant improvement if 527 // it means we don't need > 64 bits. 528 while (!Str.empty() && Str.front() == '0') 529 Str = Str.substr(1); 530 531 // If it was nothing but zeroes.... 532 if (Str.empty()) { 533 Result = APInt(64, 0); 534 return false; 535 } 536 537 // (Over-)estimate the required number of bits. 538 unsigned Log2Radix = 0; 539 while ((1U << Log2Radix) < Radix) Log2Radix++; 540 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 541 542 unsigned BitWidth = Log2Radix * Str.size(); 543 if (BitWidth < Result.getBitWidth()) 544 BitWidth = Result.getBitWidth(); // don't shrink the result 545 else if (BitWidth > Result.getBitWidth()) 546 Result = Result.zext(BitWidth); 547 548 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 549 if (!IsPowerOf2Radix) { 550 // These must have the same bit-width as Result. 551 RadixAP = APInt(BitWidth, Radix); 552 CharAP = APInt(BitWidth, 0); 553 } 554 555 // Parse all the bytes of the string given this radix. 556 Result = 0; 557 while (!Str.empty()) { 558 unsigned CharVal; 559 if (Str[0] >= '0' && Str[0] <= '9') 560 CharVal = Str[0]-'0'; 561 else if (Str[0] >= 'a' && Str[0] <= 'z') 562 CharVal = Str[0]-'a'+10; 563 else if (Str[0] >= 'A' && Str[0] <= 'Z') 564 CharVal = Str[0]-'A'+10; 565 else 566 return true; 567 568 // If the parsed value is larger than the integer radix, the string is 569 // invalid. 570 if (CharVal >= Radix) 571 return true; 572 573 // Add in this character. 574 if (IsPowerOf2Radix) { 575 Result <<= Log2Radix; 576 Result |= CharVal; 577 } else { 578 Result *= RadixAP; 579 CharAP = CharVal; 580 Result += CharAP; 581 } 582 583 Str = Str.substr(1); 584 } 585 586 return false; 587 } 588 589 bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { 590 APFloat F(0.0); 591 auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); 592 if (errorToBool(StatusOrErr.takeError())) 593 return true; 594 595 APFloat::opStatus Status = *StatusOrErr; 596 if (Status != APFloat::opOK) { 597 if (!AllowInexact || !(Status & APFloat::opInexact)) 598 return true; 599 } 600 601 Result = F.convertToDouble(); 602 return false; 603 } 604 605 // Implementation of StringRef hashing. 606 hash_code llvm::hash_value(StringRef S) { 607 return hash_combine_range(S.begin(), S.end()); 608 } 609