1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class implements the lexer for assembly files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/MC/MCParser/AsmLexer.h" 14 #include "llvm/ADT/APInt.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/ADT/StringSwitch.h" 19 #include "llvm/MC/MCAsmInfo.h" 20 #include "llvm/MC/MCParser/MCAsmLexer.h" 21 #include "llvm/Support/Compiler.h" 22 #include "llvm/Support/SMLoc.h" 23 #include "llvm/Support/SaveAndRestore.h" 24 #include <cassert> 25 #include <cctype> 26 #include <cstdio> 27 #include <cstring> 28 #include <string> 29 #include <tuple> 30 #include <utility> 31 32 using namespace llvm; 33 34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { 35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@"); 36 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); 37 } 38 39 AsmLexer::~AsmLexer() = default; 40 41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr, 42 bool EndStatementAtEOF) { 43 CurBuf = Buf; 44 45 if (ptr) 46 CurPtr = ptr; 47 else 48 CurPtr = CurBuf.begin(); 49 50 TokStart = nullptr; 51 this->EndStatementAtEOF = EndStatementAtEOF; 52 } 53 54 /// ReturnError - Set the error to the specified string at the specified 55 /// location. This is defined to always return AsmToken::Error. 56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 57 SetError(SMLoc::getFromPointer(Loc), Msg); 58 59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc)); 60 } 61 62 int AsmLexer::getNextChar() { 63 if (CurPtr == CurBuf.end()) 64 return EOF; 65 return (unsigned char)*CurPtr++; 66 } 67 68 int AsmLexer::peekNextChar() { 69 if (CurPtr == CurBuf.end()) 70 return EOF; 71 return (unsigned char)*CurPtr; 72 } 73 74 /// The leading integral digit sequence and dot should have already been 75 /// consumed, some or all of the fractional digit sequence *can* have been 76 /// consumed. 77 AsmToken AsmLexer::LexFloatLiteral() { 78 // Skip the fractional digit sequence. 79 while (isDigit(*CurPtr)) 80 ++CurPtr; 81 82 if (*CurPtr == '-' || *CurPtr == '+') 83 return ReturnError(CurPtr, "invalid sign in float literal"); 84 85 // Check for exponent 86 if ((*CurPtr == 'e' || *CurPtr == 'E')) { 87 ++CurPtr; 88 89 if (*CurPtr == '-' || *CurPtr == '+') 90 ++CurPtr; 91 92 while (isDigit(*CurPtr)) 93 ++CurPtr; 94 } 95 96 return AsmToken(AsmToken::Real, 97 StringRef(TokStart, CurPtr - TokStart)); 98 } 99 100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ 101 /// while making sure there are enough actual digits around for the constant to 102 /// be valid. 103 /// 104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed 105 /// before we get here. 106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { 107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && 108 "unexpected parse state in floating hex"); 109 bool NoFracDigits = true; 110 111 // Skip the fractional part if there is one 112 if (*CurPtr == '.') { 113 ++CurPtr; 114 115 const char *FracStart = CurPtr; 116 while (isHexDigit(*CurPtr)) 117 ++CurPtr; 118 119 NoFracDigits = CurPtr == FracStart; 120 } 121 122 if (NoIntDigits && NoFracDigits) 123 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 124 "expected at least one significand digit"); 125 126 // Make sure we do have some kind of proper exponent part 127 if (*CurPtr != 'p' && *CurPtr != 'P') 128 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 129 "expected exponent part 'p'"); 130 ++CurPtr; 131 132 if (*CurPtr == '+' || *CurPtr == '-') 133 ++CurPtr; 134 135 // N.b. exponent digits are *not* hex 136 const char *ExpStart = CurPtr; 137 while (isDigit(*CurPtr)) 138 ++CurPtr; 139 140 if (CurPtr == ExpStart) 141 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 142 "expected at least one exponent digit"); 143 144 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 145 } 146 147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]* 148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) { 149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' || 150 (AllowAt && C == '@') || (AllowHash && C == '#'); 151 } 152 153 AsmToken AsmLexer::LexIdentifier() { 154 // Check for floating point literals. 155 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) { 156 // Disambiguate a .1243foo identifier from a floating literal. 157 while (isDigit(*CurPtr)) 158 ++CurPtr; 159 160 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier, 161 AllowHashInIdentifier) || 162 *CurPtr == 'e' || *CurPtr == 'E') 163 return LexFloatLiteral(); 164 } 165 166 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier)) 167 ++CurPtr; 168 169 // Handle . as a special case. 170 if (CurPtr == TokStart+1 && TokStart[0] == '.') 171 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 172 173 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 174 } 175 176 /// LexSlash: Slash: / 177 /// C-Style Comment: /* ... */ 178 /// C-style Comment: // ... 179 AsmToken AsmLexer::LexSlash() { 180 if (!MAI.shouldAllowAdditionalComments()) { 181 IsAtStartOfStatement = false; 182 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 183 } 184 185 switch (*CurPtr) { 186 case '*': 187 IsAtStartOfStatement = false; 188 break; // C style comment. 189 case '/': 190 ++CurPtr; 191 return LexLineComment(); 192 default: 193 IsAtStartOfStatement = false; 194 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 195 } 196 197 // C Style comment. 198 ++CurPtr; // skip the star. 199 const char *CommentTextStart = CurPtr; 200 while (CurPtr != CurBuf.end()) { 201 switch (*CurPtr++) { 202 case '*': 203 // End of the comment? 204 if (*CurPtr != '/') 205 break; 206 // If we have a CommentConsumer, notify it about the comment. 207 if (CommentConsumer) { 208 CommentConsumer->HandleComment( 209 SMLoc::getFromPointer(CommentTextStart), 210 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); 211 } 212 ++CurPtr; // End the */. 213 return AsmToken(AsmToken::Comment, 214 StringRef(TokStart, CurPtr - TokStart)); 215 } 216 } 217 return ReturnError(TokStart, "unterminated comment"); 218 } 219 220 /// LexLineComment: Comment: #[^\n]* 221 /// : //[^\n]* 222 AsmToken AsmLexer::LexLineComment() { 223 // Mark This as an end of statement with a body of the 224 // comment. While it would be nicer to leave this two tokens, 225 // backwards compatability with TargetParsers makes keeping this in this form 226 // better. 227 const char *CommentTextStart = CurPtr; 228 int CurChar = getNextChar(); 229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 230 CurChar = getNextChar(); 231 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') 232 ++CurPtr; 233 234 // If we have a CommentConsumer, notify it about the comment. 235 if (CommentConsumer) { 236 CommentConsumer->HandleComment( 237 SMLoc::getFromPointer(CommentTextStart), 238 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); 239 } 240 241 IsAtStartOfLine = true; 242 // This is a whole line comment. leave newline 243 if (IsAtStartOfStatement) 244 return AsmToken(AsmToken::EndOfStatement, 245 StringRef(TokStart, CurPtr - TokStart)); 246 IsAtStartOfStatement = true; 247 248 return AsmToken(AsmToken::EndOfStatement, 249 StringRef(TokStart, CurPtr - 1 - TokStart)); 250 } 251 252 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 253 // Skip ULL, UL, U, L and LL suffices. 254 if (CurPtr[0] == 'U') 255 ++CurPtr; 256 if (CurPtr[0] == 'L') 257 ++CurPtr; 258 if (CurPtr[0] == 'L') 259 ++CurPtr; 260 } 261 262 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the 263 // integer as a hexadecimal, possibly with leading zeroes. 264 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, 265 bool LexHex) { 266 const char *FirstNonDec = nullptr; 267 const char *LookAhead = CurPtr; 268 while (true) { 269 if (isDigit(*LookAhead)) { 270 ++LookAhead; 271 } else { 272 if (!FirstNonDec) 273 FirstNonDec = LookAhead; 274 275 // Keep going if we are looking for a 'h' suffix. 276 if (LexHex && isHexDigit(*LookAhead)) 277 ++LookAhead; 278 else 279 break; 280 } 281 } 282 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H'); 283 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec; 284 if (isHex) 285 return 16; 286 return DefaultRadix; 287 } 288 289 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) { 290 while (hexDigitValue(*CurPtr) < DefaultRadix) { 291 ++CurPtr; 292 } 293 return CurPtr; 294 } 295 296 static AsmToken intToken(StringRef Ref, APInt &Value) { 297 if (Value.isIntN(64)) 298 return AsmToken(AsmToken::Integer, Ref, Value); 299 return AsmToken(AsmToken::BigNum, Ref, Value); 300 } 301 302 static std::string radixName(unsigned Radix) { 303 switch (Radix) { 304 case 2: 305 return "binary"; 306 case 8: 307 return "octal"; 308 case 10: 309 return "decimal"; 310 case 16: 311 return "hexadecimal"; 312 default: 313 return "base-" + std::to_string(Radix); 314 } 315 } 316 317 /// LexDigit: First character is [0-9]. 318 /// Local Label: [0-9][:] 319 /// Forward/Backward Label: [0-9][fb] 320 /// Binary integer: 0b[01]+ 321 /// Octal integer: 0[0-7]+ 322 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 323 /// Decimal integer: [1-9][0-9]* 324 AsmToken AsmLexer::LexDigit() { 325 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY]) 326 // MASM-flavor octal integer: [0-7]+[oOqQ] 327 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT]) 328 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] 329 if (LexMasmIntegers && isdigit(CurPtr[-1])) { 330 const char *FirstNonBinary = 331 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr; 332 const char *FirstNonDecimal = 333 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr; 334 const char *OldCurPtr = CurPtr; 335 while (isHexDigit(*CurPtr)) { 336 switch (*CurPtr) { 337 default: 338 if (!FirstNonDecimal) { 339 FirstNonDecimal = CurPtr; 340 } 341 LLVM_FALLTHROUGH; 342 case '9': 343 case '8': 344 case '7': 345 case '6': 346 case '5': 347 case '4': 348 case '3': 349 case '2': 350 if (!FirstNonBinary) { 351 FirstNonBinary = CurPtr; 352 } 353 break; 354 case '1': 355 case '0': 356 break; 357 } 358 ++CurPtr; 359 } 360 if (*CurPtr == '.') { 361 // MASM float literals (other than hex floats) always contain a ".", and 362 // are always written in decimal. 363 ++CurPtr; 364 return LexFloatLiteral(); 365 } 366 367 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) { 368 ++CurPtr; 369 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 370 } 371 372 unsigned Radix = 0; 373 if (*CurPtr == 'h' || *CurPtr == 'H') { 374 // hexadecimal number 375 ++CurPtr; 376 Radix = 16; 377 } else if (*CurPtr == 't' || *CurPtr == 'T') { 378 // decimal number 379 ++CurPtr; 380 Radix = 10; 381 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' || 382 *CurPtr == 'Q') { 383 // octal number 384 ++CurPtr; 385 Radix = 8; 386 } else if (*CurPtr == 'y' || *CurPtr == 'Y') { 387 // binary number 388 ++CurPtr; 389 Radix = 2; 390 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr && 391 DefaultRadix < 14 && 392 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) { 393 Radix = 10; 394 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr && 395 DefaultRadix < 12 && 396 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) { 397 Radix = 2; 398 } 399 400 if (Radix) { 401 StringRef Result(TokStart, CurPtr - TokStart); 402 APInt Value(128, 0, true); 403 404 if (Result.drop_back().getAsInteger(Radix, Value)) 405 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 406 407 // MSVC accepts and ignores type suffices on integer literals. 408 SkipIgnoredIntegerSuffix(CurPtr); 409 410 return intToken(Result, Value); 411 } 412 413 // default-radix integers, or floating point numbers, fall through 414 CurPtr = OldCurPtr; 415 } 416 417 // MASM default-radix integers: [0-9a-fA-F]+ 418 // (All other integer literals have a radix specifier.) 419 if (LexMasmIntegers && UseMasmDefaultRadix) { 420 CurPtr = findLastDigit(CurPtr, 16); 421 StringRef Result(TokStart, CurPtr - TokStart); 422 423 APInt Value(128, 0, true); 424 if (Result.getAsInteger(DefaultRadix, Value)) { 425 return ReturnError(TokStart, 426 "invalid " + radixName(DefaultRadix) + " number"); 427 } 428 429 return intToken(Result, Value); 430 } 431 432 // Motorola hex integers: $[0-9a-fA-F]+ 433 if (LexMotorolaIntegers && CurPtr[-1] == '$') { 434 const char *NumStart = CurPtr; 435 while (isHexDigit(CurPtr[0])) 436 ++CurPtr; 437 438 APInt Result(128, 0); 439 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result)) 440 return ReturnError(TokStart, "invalid hexadecimal number"); 441 442 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 443 } 444 445 // Motorola binary integers: %[01]+ 446 if (LexMotorolaIntegers && CurPtr[-1] == '%') { 447 const char *NumStart = CurPtr; 448 while (*CurPtr == '0' || *CurPtr == '1') 449 ++CurPtr; 450 451 APInt Result(128, 0); 452 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result)) 453 return ReturnError(TokStart, "invalid binary number"); 454 455 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 456 } 457 458 // Decimal integer: [1-9][0-9]* 459 // HLASM-flavour decimal integer: [0-9][0-9]* 460 // FIXME: Later on, support for fb for HLASM has to be added in 461 // as they probably would be needed for asm goto 462 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') { 463 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers); 464 465 if (!LexHLASMIntegers) { 466 bool IsHex = Radix == 16; 467 // Check for floating point literals. 468 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { 469 if (*CurPtr == '.') 470 ++CurPtr; 471 return LexFloatLiteral(); 472 } 473 } 474 475 StringRef Result(TokStart, CurPtr - TokStart); 476 477 APInt Value(128, 0, true); 478 if (Result.getAsInteger(Radix, Value)) 479 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 480 481 if (!LexHLASMIntegers) 482 // The darwin/x86 (and x86-64) assembler accepts and ignores type 483 // suffices on integer literals. 484 SkipIgnoredIntegerSuffix(CurPtr); 485 486 return intToken(Result, Value); 487 } 488 489 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { 490 ++CurPtr; 491 // See if we actually have "0b" as part of something like "jmp 0b\n" 492 if (!isDigit(CurPtr[0])) { 493 --CurPtr; 494 StringRef Result(TokStart, CurPtr - TokStart); 495 return AsmToken(AsmToken::Integer, Result, 0); 496 } 497 const char *NumStart = CurPtr; 498 while (CurPtr[0] == '0' || CurPtr[0] == '1') 499 ++CurPtr; 500 501 // Requires at least one binary digit. 502 if (CurPtr == NumStart) 503 return ReturnError(TokStart, "invalid binary number"); 504 505 StringRef Result(TokStart, CurPtr - TokStart); 506 507 APInt Value(128, 0, true); 508 if (Result.substr(2).getAsInteger(2, Value)) 509 return ReturnError(TokStart, "invalid binary number"); 510 511 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 512 // suffixes on integer literals. 513 SkipIgnoredIntegerSuffix(CurPtr); 514 515 return intToken(Result, Value); 516 } 517 518 if ((*CurPtr == 'x') || (*CurPtr == 'X')) { 519 ++CurPtr; 520 const char *NumStart = CurPtr; 521 while (isHexDigit(CurPtr[0])) 522 ++CurPtr; 523 524 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be 525 // diagnosed by LexHexFloatLiteral). 526 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') 527 return LexHexFloatLiteral(NumStart == CurPtr); 528 529 // Otherwise requires at least one hex digit. 530 if (CurPtr == NumStart) 531 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 532 533 APInt Result(128, 0); 534 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 535 return ReturnError(TokStart, "invalid hexadecimal number"); 536 537 // Consume the optional [hH]. 538 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) 539 ++CurPtr; 540 541 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 542 // suffixes on integer literals. 543 SkipIgnoredIntegerSuffix(CurPtr); 544 545 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 546 } 547 548 // Either octal or hexadecimal. 549 APInt Value(128, 0, true); 550 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers); 551 StringRef Result(TokStart, CurPtr - TokStart); 552 if (Result.getAsInteger(Radix, Value)) 553 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 554 555 // Consume the [hH]. 556 if (Radix == 16) 557 ++CurPtr; 558 559 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 560 // suffixes on integer literals. 561 SkipIgnoredIntegerSuffix(CurPtr); 562 563 return intToken(Result, Value); 564 } 565 566 /// LexSingleQuote: Integer: 'b' 567 AsmToken AsmLexer::LexSingleQuote() { 568 int CurChar = getNextChar(); 569 570 if (LexHLASMStrings) 571 return ReturnError(TokStart, "invalid usage of character literals"); 572 573 if (LexMasmStrings) { 574 while (CurChar != EOF) { 575 if (CurChar != '\'') { 576 CurChar = getNextChar(); 577 } else if (peekNextChar() == '\'') { 578 // In MASM single-quote strings, doubled single-quotes mean an escaped 579 // single quote, so should be lexed in. 580 getNextChar(); 581 CurChar = getNextChar(); 582 } else { 583 break; 584 } 585 } 586 if (CurChar == EOF) 587 return ReturnError(TokStart, "unterminated string constant"); 588 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 589 } 590 591 if (CurChar == '\\') 592 CurChar = getNextChar(); 593 594 if (CurChar == EOF) 595 return ReturnError(TokStart, "unterminated single quote"); 596 597 CurChar = getNextChar(); 598 599 if (CurChar != '\'') 600 return ReturnError(TokStart, "single quote way too long"); 601 602 // The idea here being that 'c' is basically just an integral 603 // constant. 604 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 605 long long Value; 606 607 if (Res.startswith("\'\\")) { 608 char theChar = Res[2]; 609 switch (theChar) { 610 default: Value = theChar; break; 611 case '\'': Value = '\''; break; 612 case 't': Value = '\t'; break; 613 case 'n': Value = '\n'; break; 614 case 'b': Value = '\b'; break; 615 case 'f': Value = '\f'; break; 616 case 'r': Value = '\r'; break; 617 } 618 } else 619 Value = TokStart[1]; 620 621 return AsmToken(AsmToken::Integer, Res, Value); 622 } 623 624 /// LexQuote: String: "..." 625 AsmToken AsmLexer::LexQuote() { 626 int CurChar = getNextChar(); 627 if (LexHLASMStrings) 628 return ReturnError(TokStart, "invalid usage of string literals"); 629 630 if (LexMasmStrings) { 631 while (CurChar != EOF) { 632 if (CurChar != '"') { 633 CurChar = getNextChar(); 634 } else if (peekNextChar() == '"') { 635 // In MASM double-quoted strings, doubled double-quotes mean an escaped 636 // double quote, so should be lexed in. 637 getNextChar(); 638 CurChar = getNextChar(); 639 } else { 640 break; 641 } 642 } 643 if (CurChar == EOF) 644 return ReturnError(TokStart, "unterminated string constant"); 645 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 646 } 647 648 // TODO: does gas allow multiline string constants? 649 while (CurChar != '"') { 650 if (CurChar == '\\') { 651 // Allow \", etc. 652 CurChar = getNextChar(); 653 } 654 655 if (CurChar == EOF) 656 return ReturnError(TokStart, "unterminated string constant"); 657 658 CurChar = getNextChar(); 659 } 660 661 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 662 } 663 664 StringRef AsmLexer::LexUntilEndOfStatement() { 665 TokStart = CurPtr; 666 667 while (!isAtStartOfComment(CurPtr) && // Start of line comment. 668 !isAtStatementSeparator(CurPtr) && // End of statement marker. 669 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 670 ++CurPtr; 671 } 672 return StringRef(TokStart, CurPtr-TokStart); 673 } 674 675 StringRef AsmLexer::LexUntilEndOfLine() { 676 TokStart = CurPtr; 677 678 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 679 ++CurPtr; 680 } 681 return StringRef(TokStart, CurPtr-TokStart); 682 } 683 684 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, 685 bool ShouldSkipSpace) { 686 SaveAndRestore<const char *> SavedTokenStart(TokStart); 687 SaveAndRestore<const char *> SavedCurPtr(CurPtr); 688 SaveAndRestore<bool> SavedAtStartOfLine(IsAtStartOfLine); 689 SaveAndRestore<bool> SavedAtStartOfStatement(IsAtStartOfStatement); 690 SaveAndRestore<bool> SavedSkipSpace(SkipSpace, ShouldSkipSpace); 691 SaveAndRestore<bool> SavedIsPeeking(IsPeeking, true); 692 std::string SavedErr = getErr(); 693 SMLoc SavedErrLoc = getErrLoc(); 694 695 size_t ReadCount; 696 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { 697 AsmToken Token = LexToken(); 698 699 Buf[ReadCount] = Token; 700 701 if (Token.is(AsmToken::Eof)) 702 break; 703 } 704 705 SetError(SavedErrLoc, SavedErr); 706 return ReadCount; 707 } 708 709 bool AsmLexer::isAtStartOfComment(const char *Ptr) { 710 if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement) 711 return false; 712 713 StringRef CommentString = MAI.getCommentString(); 714 715 if (CommentString.size() == 1) 716 return CommentString[0] == Ptr[0]; 717 718 // Allow # preprocessor commments also be counted as comments for "##" cases 719 if (CommentString[1] == '#') 720 return CommentString[0] == Ptr[0]; 721 722 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0; 723 } 724 725 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 726 return strncmp(Ptr, MAI.getSeparatorString(), 727 strlen(MAI.getSeparatorString())) == 0; 728 } 729 730 AsmToken AsmLexer::LexToken() { 731 TokStart = CurPtr; 732 // This always consumes at least one character. 733 int CurChar = getNextChar(); 734 735 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) { 736 // If this starts with a '#', this may be a cpp 737 // hash directive and otherwise a line comment. 738 AsmToken TokenBuf[2]; 739 MutableArrayRef<AsmToken> Buf(TokenBuf, 2); 740 size_t num = peekTokens(Buf, true); 741 // There cannot be a space preceding this 742 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) && 743 TokenBuf[1].is(AsmToken::String)) { 744 CurPtr = TokStart; // reset curPtr; 745 StringRef s = LexUntilEndOfLine(); 746 UnLex(TokenBuf[1]); 747 UnLex(TokenBuf[0]); 748 return AsmToken(AsmToken::HashDirective, s); 749 } 750 751 if (MAI.shouldAllowAdditionalComments()) 752 return LexLineComment(); 753 } 754 755 if (isAtStartOfComment(TokStart)) 756 return LexLineComment(); 757 758 if (isAtStatementSeparator(TokStart)) { 759 CurPtr += strlen(MAI.getSeparatorString()) - 1; 760 IsAtStartOfLine = true; 761 IsAtStartOfStatement = true; 762 return AsmToken(AsmToken::EndOfStatement, 763 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 764 } 765 766 // If we're missing a newline at EOF, make sure we still get an 767 // EndOfStatement token before the Eof token. 768 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) { 769 IsAtStartOfLine = true; 770 IsAtStartOfStatement = true; 771 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); 772 } 773 IsAtStartOfLine = false; 774 bool OldIsAtStartOfStatement = IsAtStartOfStatement; 775 IsAtStartOfStatement = false; 776 switch (CurChar) { 777 default: 778 // Handle identifier: [a-zA-Z_.?][a-zA-Z0-9_$.@#?]* 779 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.' || 780 (MAI.doesAllowQuestionAtStartOfIdentifier() && CurChar == '?')) 781 return LexIdentifier(); 782 783 // Unknown character, emit an error. 784 return ReturnError(TokStart, "invalid character in input"); 785 case EOF: 786 if (EndStatementAtEOF) { 787 IsAtStartOfLine = true; 788 IsAtStartOfStatement = true; 789 } 790 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 791 case 0: 792 case ' ': 793 case '\t': 794 IsAtStartOfStatement = OldIsAtStartOfStatement; 795 while (*CurPtr == ' ' || *CurPtr == '\t') 796 CurPtr++; 797 if (SkipSpace) 798 return LexToken(); // Ignore whitespace. 799 else 800 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart)); 801 case '\r': { 802 IsAtStartOfLine = true; 803 IsAtStartOfStatement = true; 804 // If this is a CR followed by LF, treat that as one token. 805 if (CurPtr != CurBuf.end() && *CurPtr == '\n') 806 ++CurPtr; 807 return AsmToken(AsmToken::EndOfStatement, 808 StringRef(TokStart, CurPtr - TokStart)); 809 } 810 case '\n': 811 IsAtStartOfLine = true; 812 IsAtStartOfStatement = true; 813 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 814 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 815 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 816 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 817 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 818 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 819 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 820 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 821 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 822 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 823 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 824 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 825 case '$': { 826 if (LexMotorolaIntegers && isHexDigit(*CurPtr)) 827 return LexDigit(); 828 if (MAI.doesAllowDollarAtStartOfIdentifier()) 829 return LexIdentifier(); 830 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 831 } 832 case '@': { 833 if (MAI.doesAllowAtAtStartOfIdentifier()) 834 return LexIdentifier(); 835 return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 836 } 837 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 838 case '=': 839 if (*CurPtr == '=') { 840 ++CurPtr; 841 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 842 } 843 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 844 case '-': 845 if (*CurPtr == '>') { 846 ++CurPtr; 847 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); 848 } 849 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 850 case '|': 851 if (*CurPtr == '|') { 852 ++CurPtr; 853 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 854 } 855 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 856 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 857 case '&': 858 if (*CurPtr == '&') { 859 ++CurPtr; 860 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 861 } 862 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 863 case '!': 864 if (*CurPtr == '=') { 865 ++CurPtr; 866 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 867 } 868 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 869 case '%': 870 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) { 871 return LexDigit(); 872 } 873 874 if (MAI.hasMipsExpressions()) { 875 AsmToken::TokenKind Operator; 876 unsigned OperatorLength; 877 878 std::tie(Operator, OperatorLength) = 879 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>( 880 StringRef(CurPtr)) 881 .StartsWith("call16", {AsmToken::PercentCall16, 7}) 882 .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8}) 883 .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8}) 884 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10}) 885 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10}) 886 .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9}) 887 .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7}) 888 .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7}) 889 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9}) 890 .StartsWith("got_page", {AsmToken::PercentGot_Page, 9}) 891 .StartsWith("gottprel", {AsmToken::PercentGottprel, 9}) 892 .StartsWith("got", {AsmToken::PercentGot, 4}) 893 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7}) 894 .StartsWith("higher", {AsmToken::PercentHigher, 7}) 895 .StartsWith("highest", {AsmToken::PercentHighest, 8}) 896 .StartsWith("hi", {AsmToken::PercentHi, 3}) 897 .StartsWith("lo", {AsmToken::PercentLo, 3}) 898 .StartsWith("neg", {AsmToken::PercentNeg, 4}) 899 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9}) 900 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9}) 901 .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6}) 902 .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7}) 903 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9}) 904 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9}) 905 .Default({AsmToken::Percent, 1}); 906 907 if (Operator != AsmToken::Percent) { 908 CurPtr += OperatorLength - 1; 909 return AsmToken(Operator, StringRef(TokStart, OperatorLength)); 910 } 911 } 912 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 913 case '/': 914 IsAtStartOfStatement = OldIsAtStartOfStatement; 915 return LexSlash(); 916 case '#': { 917 if (MAI.doesAllowHashAtStartOfIdentifier()) 918 return LexIdentifier(); 919 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 920 } 921 case '\'': return LexSingleQuote(); 922 case '"': return LexQuote(); 923 case '0': case '1': case '2': case '3': case '4': 924 case '5': case '6': case '7': case '8': case '9': 925 return LexDigit(); 926 case '<': 927 switch (*CurPtr) { 928 case '<': 929 ++CurPtr; 930 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); 931 case '=': 932 ++CurPtr; 933 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); 934 case '>': 935 ++CurPtr; 936 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); 937 default: 938 return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 939 } 940 case '>': 941 switch (*CurPtr) { 942 case '>': 943 ++CurPtr; 944 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); 945 case '=': 946 ++CurPtr; 947 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); 948 default: 949 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 950 } 951 952 // TODO: Quoted identifiers (objc methods etc) 953 // local labels: [0-9][:] 954 // Forward/backward labels: [0-9][fb] 955 // Integers, fp constants, character constants. 956 } 957 } 958