1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class implements the lexer for assembly files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/MC/MCParser/AsmLexer.h" 14 #include "llvm/ADT/APInt.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/ADT/StringSwitch.h" 19 #include "llvm/MC/MCAsmInfo.h" 20 #include "llvm/MC/MCParser/MCAsmLexer.h" 21 #include "llvm/Support/Compiler.h" 22 #include "llvm/Support/SMLoc.h" 23 #include "llvm/Support/SaveAndRestore.h" 24 #include <cassert> 25 #include <cctype> 26 #include <cstdio> 27 #include <cstring> 28 #include <string> 29 #include <tuple> 30 #include <utility> 31 32 using namespace llvm; 33 34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { 35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@"); 36 } 37 38 AsmLexer::~AsmLexer() = default; 39 40 void AsmLexer::setBuffer(StringRef Buf, const char *ptr, 41 bool EndStatementAtEOF) { 42 CurBuf = Buf; 43 44 if (ptr) 45 CurPtr = ptr; 46 else 47 CurPtr = CurBuf.begin(); 48 49 TokStart = nullptr; 50 this->EndStatementAtEOF = EndStatementAtEOF; 51 } 52 53 /// ReturnError - Set the error to the specified string at the specified 54 /// location. This is defined to always return AsmToken::Error. 55 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 56 SetError(SMLoc::getFromPointer(Loc), Msg); 57 58 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc)); 59 } 60 61 int AsmLexer::getNextChar() { 62 if (CurPtr == CurBuf.end()) 63 return EOF; 64 return (unsigned char)*CurPtr++; 65 } 66 67 int AsmLexer::peekNextChar() { 68 if (CurPtr == CurBuf.end()) 69 return EOF; 70 return (unsigned char)*CurPtr; 71 } 72 73 /// The leading integral digit sequence and dot should have already been 74 /// consumed, some or all of the fractional digit sequence *can* have been 75 /// consumed. 76 AsmToken AsmLexer::LexFloatLiteral() { 77 // Skip the fractional digit sequence. 78 while (isDigit(*CurPtr)) 79 ++CurPtr; 80 81 if (*CurPtr == '-' || *CurPtr == '+') 82 return ReturnError(CurPtr, "Invalid sign in float literal"); 83 84 // Check for exponent 85 if ((*CurPtr == 'e' || *CurPtr == 'E')) { 86 ++CurPtr; 87 88 if (*CurPtr == '-' || *CurPtr == '+') 89 ++CurPtr; 90 91 while (isDigit(*CurPtr)) 92 ++CurPtr; 93 } 94 95 return AsmToken(AsmToken::Real, 96 StringRef(TokStart, CurPtr - TokStart)); 97 } 98 99 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ 100 /// while making sure there are enough actual digits around for the constant to 101 /// be valid. 102 /// 103 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed 104 /// before we get here. 105 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { 106 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && 107 "unexpected parse state in floating hex"); 108 bool NoFracDigits = true; 109 110 // Skip the fractional part if there is one 111 if (*CurPtr == '.') { 112 ++CurPtr; 113 114 const char *FracStart = CurPtr; 115 while (isHexDigit(*CurPtr)) 116 ++CurPtr; 117 118 NoFracDigits = CurPtr == FracStart; 119 } 120 121 if (NoIntDigits && NoFracDigits) 122 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 123 "expected at least one significand digit"); 124 125 // Make sure we do have some kind of proper exponent part 126 if (*CurPtr != 'p' && *CurPtr != 'P') 127 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 128 "expected exponent part 'p'"); 129 ++CurPtr; 130 131 if (*CurPtr == '+' || *CurPtr == '-') 132 ++CurPtr; 133 134 // N.b. exponent digits are *not* hex 135 const char *ExpStart = CurPtr; 136 while (isDigit(*CurPtr)) 137 ++CurPtr; 138 139 if (CurPtr == ExpStart) 140 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 141 "expected at least one exponent digit"); 142 143 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 144 } 145 146 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]* 147 static bool IsIdentifierChar(char c, bool AllowAt) { 148 return isAlnum(c) || c == '_' || c == '$' || c == '.' || 149 (c == '@' && AllowAt) || c == '?'; 150 } 151 152 AsmToken AsmLexer::LexIdentifier() { 153 // Check for floating point literals. 154 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) { 155 // Disambiguate a .1243foo identifier from a floating literal. 156 while (isDigit(*CurPtr)) 157 ++CurPtr; 158 159 if (!IsIdentifierChar(*CurPtr, AllowAtInIdentifier) || 160 *CurPtr == 'e' || *CurPtr == 'E') 161 return LexFloatLiteral(); 162 } 163 164 while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier)) 165 ++CurPtr; 166 167 // Handle . as a special case. 168 if (CurPtr == TokStart+1 && TokStart[0] == '.') 169 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 170 171 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 172 } 173 174 /// LexSlash: Slash: / 175 /// C-Style Comment: /* ... */ 176 AsmToken AsmLexer::LexSlash() { 177 switch (*CurPtr) { 178 case '*': 179 IsAtStartOfStatement = false; 180 break; // C style comment. 181 case '/': 182 ++CurPtr; 183 return LexLineComment(); 184 default: 185 IsAtStartOfStatement = false; 186 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 187 } 188 189 // C Style comment. 190 ++CurPtr; // skip the star. 191 const char *CommentTextStart = CurPtr; 192 while (CurPtr != CurBuf.end()) { 193 switch (*CurPtr++) { 194 case '*': 195 // End of the comment? 196 if (*CurPtr != '/') 197 break; 198 // If we have a CommentConsumer, notify it about the comment. 199 if (CommentConsumer) { 200 CommentConsumer->HandleComment( 201 SMLoc::getFromPointer(CommentTextStart), 202 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); 203 } 204 ++CurPtr; // End the */. 205 return AsmToken(AsmToken::Comment, 206 StringRef(TokStart, CurPtr - TokStart)); 207 } 208 } 209 return ReturnError(TokStart, "unterminated comment"); 210 } 211 212 /// LexLineComment: Comment: #[^\n]* 213 /// : //[^\n]* 214 AsmToken AsmLexer::LexLineComment() { 215 // Mark This as an end of statement with a body of the 216 // comment. While it would be nicer to leave this two tokens, 217 // backwards compatability with TargetParsers makes keeping this in this form 218 // better. 219 const char *CommentTextStart = CurPtr; 220 int CurChar = getNextChar(); 221 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 222 CurChar = getNextChar(); 223 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') 224 ++CurPtr; 225 226 // If we have a CommentConsumer, notify it about the comment. 227 if (CommentConsumer) { 228 CommentConsumer->HandleComment( 229 SMLoc::getFromPointer(CommentTextStart), 230 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); 231 } 232 233 IsAtStartOfLine = true; 234 // This is a whole line comment. leave newline 235 if (IsAtStartOfStatement) 236 return AsmToken(AsmToken::EndOfStatement, 237 StringRef(TokStart, CurPtr - TokStart)); 238 IsAtStartOfStatement = true; 239 240 return AsmToken(AsmToken::EndOfStatement, 241 StringRef(TokStart, CurPtr - 1 - TokStart)); 242 } 243 244 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 245 // Skip ULL, UL, U, L and LL suffices. 246 if (CurPtr[0] == 'U') 247 ++CurPtr; 248 if (CurPtr[0] == 'L') 249 ++CurPtr; 250 if (CurPtr[0] == 'L') 251 ++CurPtr; 252 } 253 254 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the 255 // integer as a hexadecimal, possibly with leading zeroes. 256 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, 257 bool LexHex) { 258 const char *FirstNonDec = nullptr; 259 const char *LookAhead = CurPtr; 260 while (true) { 261 if (isDigit(*LookAhead)) { 262 ++LookAhead; 263 } else { 264 if (!FirstNonDec) 265 FirstNonDec = LookAhead; 266 267 // Keep going if we are looking for a 'h' suffix. 268 if (LexHex && isHexDigit(*LookAhead)) 269 ++LookAhead; 270 else 271 break; 272 } 273 } 274 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H'); 275 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec; 276 if (isHex) 277 return 16; 278 return DefaultRadix; 279 } 280 281 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) { 282 while (hexDigitValue(*CurPtr) < DefaultRadix) { 283 ++CurPtr; 284 } 285 return CurPtr; 286 } 287 288 static AsmToken intToken(StringRef Ref, APInt &Value) { 289 if (Value.isIntN(64)) 290 return AsmToken(AsmToken::Integer, Ref, Value); 291 return AsmToken(AsmToken::BigNum, Ref, Value); 292 } 293 294 static std::string radixName(unsigned Radix) { 295 switch (Radix) { 296 case 2: 297 return "binary"; 298 case 8: 299 return "octal"; 300 case 10: 301 return "decimal"; 302 case 16: 303 return "hexadecimal"; 304 default: 305 return "base-" + std::to_string(Radix); 306 } 307 } 308 309 /// LexDigit: First character is [0-9]. 310 /// Local Label: [0-9][:] 311 /// Forward/Backward Label: [0-9][fb] 312 /// Binary integer: 0b[01]+ 313 /// Octal integer: 0[0-7]+ 314 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 315 /// Decimal integer: [1-9][0-9]* 316 AsmToken AsmLexer::LexDigit() { 317 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY]) 318 // MASM-flavor octal integer: [0-7]+[oOqQ] 319 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT]) 320 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] 321 if (LexMasmIntegers && isdigit(CurPtr[-1])) { 322 const char *FirstNonBinary = 323 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr; 324 const char *FirstNonDecimal = 325 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr; 326 const char *OldCurPtr = CurPtr; 327 while (isHexDigit(*CurPtr)) { 328 switch (*CurPtr) { 329 default: 330 if (!FirstNonDecimal) { 331 FirstNonDecimal = CurPtr; 332 } 333 LLVM_FALLTHROUGH; 334 case '9': 335 case '8': 336 case '7': 337 case '6': 338 case '5': 339 case '4': 340 case '3': 341 case '2': 342 if (!FirstNonBinary) { 343 FirstNonBinary = CurPtr; 344 } 345 break; 346 case '1': 347 case '0': 348 break; 349 } 350 ++CurPtr; 351 } 352 if (*CurPtr == '.') { 353 // MASM float literals (other than hex floats) always contain a ".", and 354 // are always written in decimal. 355 ++CurPtr; 356 return LexFloatLiteral(); 357 } 358 359 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) { 360 ++CurPtr; 361 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 362 } 363 364 unsigned Radix = 0; 365 if (*CurPtr == 'h' || *CurPtr == 'H') { 366 // hexadecimal number 367 ++CurPtr; 368 Radix = 16; 369 } else if (*CurPtr == 't' || *CurPtr == 'T') { 370 // decimal number 371 ++CurPtr; 372 Radix = 10; 373 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' || 374 *CurPtr == 'Q') { 375 // octal number 376 ++CurPtr; 377 Radix = 8; 378 } else if (*CurPtr == 'y' || *CurPtr == 'Y') { 379 // binary number 380 ++CurPtr; 381 Radix = 2; 382 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr && 383 DefaultRadix < 14 && 384 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) { 385 Radix = 10; 386 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr && 387 DefaultRadix < 12 && 388 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) { 389 Radix = 2; 390 } 391 392 if (Radix) { 393 StringRef Result(TokStart, CurPtr - TokStart); 394 APInt Value(128, 0, true); 395 396 if (Result.drop_back().getAsInteger(Radix, Value)) 397 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 398 399 // MSVC accepts and ignores type suffices on integer literals. 400 SkipIgnoredIntegerSuffix(CurPtr); 401 402 return intToken(Result, Value); 403 } 404 405 // default-radix integers, or floating point numbers, fall through 406 CurPtr = OldCurPtr; 407 } 408 409 // MASM default-radix integers: [0-9a-fA-F]+ 410 // (All other integer literals have a radix specifier.) 411 if (LexMasmIntegers && UseMasmDefaultRadix) { 412 CurPtr = findLastDigit(CurPtr, 16); 413 StringRef Result(TokStart, CurPtr - TokStart); 414 415 APInt Value(128, 0, true); 416 if (Result.getAsInteger(DefaultRadix, Value)) { 417 return ReturnError(TokStart, 418 "invalid " + radixName(DefaultRadix) + " number"); 419 } 420 421 return intToken(Result, Value); 422 } 423 424 // Decimal integer: [1-9][0-9]* 425 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 426 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers); 427 bool isHex = Radix == 16; 428 // Check for floating point literals. 429 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { 430 if (*CurPtr == '.') 431 ++CurPtr; 432 return LexFloatLiteral(); 433 } 434 435 StringRef Result(TokStart, CurPtr - TokStart); 436 437 APInt Value(128, 0, true); 438 if (Result.getAsInteger(Radix, Value)) { 439 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 440 } 441 442 // The darwin/x86 (and x86-64) assembler accepts and ignores type 443 // suffices on integer literals. 444 SkipIgnoredIntegerSuffix(CurPtr); 445 446 return intToken(Result, Value); 447 } 448 449 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { 450 ++CurPtr; 451 // See if we actually have "0b" as part of something like "jmp 0b\n" 452 if (!isDigit(CurPtr[0])) { 453 --CurPtr; 454 StringRef Result(TokStart, CurPtr - TokStart); 455 return AsmToken(AsmToken::Integer, Result, 0); 456 } 457 const char *NumStart = CurPtr; 458 while (CurPtr[0] == '0' || CurPtr[0] == '1') 459 ++CurPtr; 460 461 // Requires at least one binary digit. 462 if (CurPtr == NumStart) 463 return ReturnError(TokStart, "invalid binary number"); 464 465 StringRef Result(TokStart, CurPtr - TokStart); 466 467 APInt Value(128, 0, true); 468 if (Result.substr(2).getAsInteger(2, Value)) 469 return ReturnError(TokStart, "invalid binary number"); 470 471 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 472 // suffixes on integer literals. 473 SkipIgnoredIntegerSuffix(CurPtr); 474 475 return intToken(Result, Value); 476 } 477 478 if ((*CurPtr == 'x') || (*CurPtr == 'X')) { 479 ++CurPtr; 480 const char *NumStart = CurPtr; 481 while (isHexDigit(CurPtr[0])) 482 ++CurPtr; 483 484 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be 485 // diagnosed by LexHexFloatLiteral). 486 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') 487 return LexHexFloatLiteral(NumStart == CurPtr); 488 489 // Otherwise requires at least one hex digit. 490 if (CurPtr == NumStart) 491 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 492 493 APInt Result(128, 0); 494 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 495 return ReturnError(TokStart, "invalid hexadecimal number"); 496 497 // Consume the optional [hH]. 498 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) 499 ++CurPtr; 500 501 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 502 // suffixes on integer literals. 503 SkipIgnoredIntegerSuffix(CurPtr); 504 505 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 506 } 507 508 // Either octal or hexadecimal. 509 APInt Value(128, 0, true); 510 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers); 511 StringRef Result(TokStart, CurPtr - TokStart); 512 if (Result.getAsInteger(Radix, Value)) 513 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 514 515 // Consume the [hH]. 516 if (Radix == 16) 517 ++CurPtr; 518 519 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 520 // suffixes on integer literals. 521 SkipIgnoredIntegerSuffix(CurPtr); 522 523 return intToken(Result, Value); 524 } 525 526 /// LexSingleQuote: Integer: 'b' 527 AsmToken AsmLexer::LexSingleQuote() { 528 int CurChar = getNextChar(); 529 530 if (LexMasmStrings) { 531 while (CurChar != EOF) { 532 if (CurChar != '\'') { 533 CurChar = getNextChar(); 534 } else if (peekNextChar() == '\'') { 535 // In MASM single-quote strings, doubled single-quotes mean an escaped 536 // single quote, so should be lexed in. 537 getNextChar(); 538 CurChar = getNextChar(); 539 } else { 540 break; 541 } 542 } 543 if (CurChar == EOF) 544 return ReturnError(TokStart, "unterminated string constant"); 545 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 546 } 547 548 if (CurChar == '\\') 549 CurChar = getNextChar(); 550 551 if (CurChar == EOF) 552 return ReturnError(TokStart, "unterminated single quote"); 553 554 CurChar = getNextChar(); 555 556 if (CurChar != '\'') 557 return ReturnError(TokStart, "single quote way too long"); 558 559 // The idea here being that 'c' is basically just an integral 560 // constant. 561 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 562 long long Value; 563 564 if (Res.startswith("\'\\")) { 565 char theChar = Res[2]; 566 switch (theChar) { 567 default: Value = theChar; break; 568 case '\'': Value = '\''; break; 569 case 't': Value = '\t'; break; 570 case 'n': Value = '\n'; break; 571 case 'b': Value = '\b'; break; 572 } 573 } else 574 Value = TokStart[1]; 575 576 return AsmToken(AsmToken::Integer, Res, Value); 577 } 578 579 /// LexQuote: String: "..." 580 AsmToken AsmLexer::LexQuote() { 581 int CurChar = getNextChar(); 582 if (LexMasmStrings) { 583 while (CurChar != EOF) { 584 if (CurChar != '"') { 585 CurChar = getNextChar(); 586 } else if (peekNextChar() == '"') { 587 // In MASM double-quoted strings, doubled double-quotes mean an escaped 588 // double quote, so should be lexed in. 589 getNextChar(); 590 CurChar = getNextChar(); 591 } else { 592 break; 593 } 594 } 595 if (CurChar == EOF) 596 return ReturnError(TokStart, "unterminated string constant"); 597 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 598 } 599 600 // TODO: does gas allow multiline string constants? 601 while (CurChar != '"') { 602 if (CurChar == '\\') { 603 // Allow \", etc. 604 CurChar = getNextChar(); 605 } 606 607 if (CurChar == EOF) 608 return ReturnError(TokStart, "unterminated string constant"); 609 610 CurChar = getNextChar(); 611 } 612 613 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 614 } 615 616 StringRef AsmLexer::LexUntilEndOfStatement() { 617 TokStart = CurPtr; 618 619 while (!isAtStartOfComment(CurPtr) && // Start of line comment. 620 !isAtStatementSeparator(CurPtr) && // End of statement marker. 621 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 622 ++CurPtr; 623 } 624 return StringRef(TokStart, CurPtr-TokStart); 625 } 626 627 StringRef AsmLexer::LexUntilEndOfLine() { 628 TokStart = CurPtr; 629 630 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 631 ++CurPtr; 632 } 633 return StringRef(TokStart, CurPtr-TokStart); 634 } 635 636 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, 637 bool ShouldSkipSpace) { 638 SaveAndRestore<const char *> SavedTokenStart(TokStart); 639 SaveAndRestore<const char *> SavedCurPtr(CurPtr); 640 SaveAndRestore<bool> SavedAtStartOfLine(IsAtStartOfLine); 641 SaveAndRestore<bool> SavedAtStartOfStatement(IsAtStartOfStatement); 642 SaveAndRestore<bool> SavedSkipSpace(SkipSpace, ShouldSkipSpace); 643 SaveAndRestore<bool> SavedIsPeeking(IsPeeking, true); 644 std::string SavedErr = getErr(); 645 SMLoc SavedErrLoc = getErrLoc(); 646 647 size_t ReadCount; 648 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { 649 AsmToken Token = LexToken(); 650 651 Buf[ReadCount] = Token; 652 653 if (Token.is(AsmToken::Eof)) 654 break; 655 } 656 657 SetError(SavedErrLoc, SavedErr); 658 return ReadCount; 659 } 660 661 bool AsmLexer::isAtStartOfComment(const char *Ptr) { 662 StringRef CommentString = MAI.getCommentString(); 663 664 if (CommentString.size() == 1) 665 return CommentString[0] == Ptr[0]; 666 667 // Allow # preprocessor commments also be counted as comments for "##" cases 668 if (CommentString[1] == '#') 669 return CommentString[0] == Ptr[0]; 670 671 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0; 672 } 673 674 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 675 return strncmp(Ptr, MAI.getSeparatorString(), 676 strlen(MAI.getSeparatorString())) == 0; 677 } 678 679 AsmToken AsmLexer::LexToken() { 680 TokStart = CurPtr; 681 // This always consumes at least one character. 682 int CurChar = getNextChar(); 683 684 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) { 685 // If this starts with a '#', this may be a cpp 686 // hash directive and otherwise a line comment. 687 AsmToken TokenBuf[2]; 688 MutableArrayRef<AsmToken> Buf(TokenBuf, 2); 689 size_t num = peekTokens(Buf, true); 690 // There cannot be a space preceding this 691 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) && 692 TokenBuf[1].is(AsmToken::String)) { 693 CurPtr = TokStart; // reset curPtr; 694 StringRef s = LexUntilEndOfLine(); 695 UnLex(TokenBuf[1]); 696 UnLex(TokenBuf[0]); 697 return AsmToken(AsmToken::HashDirective, s); 698 } 699 return LexLineComment(); 700 } 701 702 if (isAtStartOfComment(TokStart)) 703 return LexLineComment(); 704 705 if (isAtStatementSeparator(TokStart)) { 706 CurPtr += strlen(MAI.getSeparatorString()) - 1; 707 IsAtStartOfLine = true; 708 IsAtStartOfStatement = true; 709 return AsmToken(AsmToken::EndOfStatement, 710 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 711 } 712 713 // If we're missing a newline at EOF, make sure we still get an 714 // EndOfStatement token before the Eof token. 715 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) { 716 IsAtStartOfLine = true; 717 IsAtStartOfStatement = true; 718 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); 719 } 720 IsAtStartOfLine = false; 721 bool OldIsAtStartOfStatement = IsAtStartOfStatement; 722 IsAtStartOfStatement = false; 723 switch (CurChar) { 724 default: 725 if (MAI.doesAllowSymbolAtNameStart()) { 726 // Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@?]* 727 if (!isDigit(CurChar) && 728 IsIdentifierChar(CurChar, MAI.doesAllowAtInName())) 729 return LexIdentifier(); 730 } else { 731 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 732 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 733 return LexIdentifier(); 734 } 735 736 // Unknown character, emit an error. 737 return ReturnError(TokStart, "invalid character in input"); 738 case EOF: 739 if (EndStatementAtEOF) { 740 IsAtStartOfLine = true; 741 IsAtStartOfStatement = true; 742 } 743 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 744 case 0: 745 case ' ': 746 case '\t': 747 IsAtStartOfStatement = OldIsAtStartOfStatement; 748 while (*CurPtr == ' ' || *CurPtr == '\t') 749 CurPtr++; 750 if (SkipSpace) 751 return LexToken(); // Ignore whitespace. 752 else 753 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart)); 754 case '\r': { 755 IsAtStartOfLine = true; 756 IsAtStartOfStatement = true; 757 // If this is a CR followed by LF, treat that as one token. 758 if (CurPtr != CurBuf.end() && *CurPtr == '\n') 759 ++CurPtr; 760 return AsmToken(AsmToken::EndOfStatement, 761 StringRef(TokStart, CurPtr - TokStart)); 762 } 763 case '\n': 764 IsAtStartOfLine = true; 765 IsAtStartOfStatement = true; 766 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 767 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 768 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 769 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 770 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 771 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 772 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 773 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 774 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 775 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 776 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 777 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 778 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 779 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 780 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 781 case '=': 782 if (*CurPtr == '=') { 783 ++CurPtr; 784 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 785 } 786 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 787 case '-': 788 if (*CurPtr == '>') { 789 ++CurPtr; 790 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); 791 } 792 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 793 case '|': 794 if (*CurPtr == '|') { 795 ++CurPtr; 796 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 797 } 798 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 799 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 800 case '&': 801 if (*CurPtr == '&') { 802 ++CurPtr; 803 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 804 } 805 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 806 case '!': 807 if (*CurPtr == '=') { 808 ++CurPtr; 809 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 810 } 811 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 812 case '%': 813 if (MAI.hasMipsExpressions()) { 814 AsmToken::TokenKind Operator; 815 unsigned OperatorLength; 816 817 std::tie(Operator, OperatorLength) = 818 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>( 819 StringRef(CurPtr)) 820 .StartsWith("call16", {AsmToken::PercentCall16, 7}) 821 .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8}) 822 .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8}) 823 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10}) 824 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10}) 825 .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9}) 826 .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7}) 827 .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7}) 828 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9}) 829 .StartsWith("got_page", {AsmToken::PercentGot_Page, 9}) 830 .StartsWith("gottprel", {AsmToken::PercentGottprel, 9}) 831 .StartsWith("got", {AsmToken::PercentGot, 4}) 832 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7}) 833 .StartsWith("higher", {AsmToken::PercentHigher, 7}) 834 .StartsWith("highest", {AsmToken::PercentHighest, 8}) 835 .StartsWith("hi", {AsmToken::PercentHi, 3}) 836 .StartsWith("lo", {AsmToken::PercentLo, 3}) 837 .StartsWith("neg", {AsmToken::PercentNeg, 4}) 838 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9}) 839 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9}) 840 .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6}) 841 .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7}) 842 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9}) 843 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9}) 844 .Default({AsmToken::Percent, 1}); 845 846 if (Operator != AsmToken::Percent) { 847 CurPtr += OperatorLength - 1; 848 return AsmToken(Operator, StringRef(TokStart, OperatorLength)); 849 } 850 } 851 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 852 case '/': 853 IsAtStartOfStatement = OldIsAtStartOfStatement; 854 return LexSlash(); 855 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 856 case '\'': return LexSingleQuote(); 857 case '"': return LexQuote(); 858 case '0': case '1': case '2': case '3': case '4': 859 case '5': case '6': case '7': case '8': case '9': 860 return LexDigit(); 861 case '<': 862 switch (*CurPtr) { 863 case '<': 864 ++CurPtr; 865 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); 866 case '=': 867 ++CurPtr; 868 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); 869 case '>': 870 ++CurPtr; 871 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); 872 default: 873 return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 874 } 875 case '>': 876 switch (*CurPtr) { 877 case '>': 878 ++CurPtr; 879 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); 880 case '=': 881 ++CurPtr; 882 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); 883 default: 884 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 885 } 886 887 // TODO: Quoted identifiers (objc methods etc) 888 // local labels: [0-9][:] 889 // Forward/backward labels: [0-9][fb] 890 // Integers, fp constants, character constants. 891 } 892 } 893