1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class implements the lexer for assembly files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/MC/MCParser/AsmLexer.h" 14 #include "llvm/ADT/APInt.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/ADT/StringSwitch.h" 19 #include "llvm/MC/MCAsmInfo.h" 20 #include "llvm/MC/MCParser/MCAsmLexer.h" 21 #include "llvm/Support/Compiler.h" 22 #include "llvm/Support/SMLoc.h" 23 #include "llvm/Support/SaveAndRestore.h" 24 #include <cassert> 25 #include <cctype> 26 #include <cstdio> 27 #include <cstring> 28 #include <string> 29 #include <tuple> 30 #include <utility> 31 32 using namespace llvm; 33 34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { 35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@"); 36 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); 37 } 38 39 AsmLexer::~AsmLexer() = default; 40 41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr, 42 bool EndStatementAtEOF) { 43 CurBuf = Buf; 44 45 if (ptr) 46 CurPtr = ptr; 47 else 48 CurPtr = CurBuf.begin(); 49 50 TokStart = nullptr; 51 this->EndStatementAtEOF = EndStatementAtEOF; 52 } 53 54 /// ReturnError - Set the error to the specified string at the specified 55 /// location. This is defined to always return AsmToken::Error. 56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 57 SetError(SMLoc::getFromPointer(Loc), Msg); 58 59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc)); 60 } 61 62 int AsmLexer::getNextChar() { 63 if (CurPtr == CurBuf.end()) 64 return EOF; 65 return (unsigned char)*CurPtr++; 66 } 67 68 int AsmLexer::peekNextChar() { 69 if (CurPtr == CurBuf.end()) 70 return EOF; 71 return (unsigned char)*CurPtr; 72 } 73 74 /// The leading integral digit sequence and dot should have already been 75 /// consumed, some or all of the fractional digit sequence *can* have been 76 /// consumed. 77 AsmToken AsmLexer::LexFloatLiteral() { 78 // Skip the fractional digit sequence. 79 while (isDigit(*CurPtr)) 80 ++CurPtr; 81 82 if (*CurPtr == '-' || *CurPtr == '+') 83 return ReturnError(CurPtr, "invalid sign in float literal"); 84 85 // Check for exponent 86 if ((*CurPtr == 'e' || *CurPtr == 'E')) { 87 ++CurPtr; 88 89 if (*CurPtr == '-' || *CurPtr == '+') 90 ++CurPtr; 91 92 while (isDigit(*CurPtr)) 93 ++CurPtr; 94 } 95 96 return AsmToken(AsmToken::Real, 97 StringRef(TokStart, CurPtr - TokStart)); 98 } 99 100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ 101 /// while making sure there are enough actual digits around for the constant to 102 /// be valid. 103 /// 104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed 105 /// before we get here. 106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { 107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && 108 "unexpected parse state in floating hex"); 109 bool NoFracDigits = true; 110 111 // Skip the fractional part if there is one 112 if (*CurPtr == '.') { 113 ++CurPtr; 114 115 const char *FracStart = CurPtr; 116 while (isHexDigit(*CurPtr)) 117 ++CurPtr; 118 119 NoFracDigits = CurPtr == FracStart; 120 } 121 122 if (NoIntDigits && NoFracDigits) 123 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 124 "expected at least one significand digit"); 125 126 // Make sure we do have some kind of proper exponent part 127 if (*CurPtr != 'p' && *CurPtr != 'P') 128 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 129 "expected exponent part 'p'"); 130 ++CurPtr; 131 132 if (*CurPtr == '+' || *CurPtr == '-') 133 ++CurPtr; 134 135 // N.b. exponent digits are *not* hex 136 const char *ExpStart = CurPtr; 137 while (isDigit(*CurPtr)) 138 ++CurPtr; 139 140 if (CurPtr == ExpStart) 141 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 142 "expected at least one exponent digit"); 143 144 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 145 } 146 147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]* 148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) { 149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' || 150 (AllowAt && C == '@') || (AllowHash && C == '#'); 151 } 152 153 AsmToken AsmLexer::LexIdentifier() { 154 // Check for floating point literals. 155 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) { 156 // Disambiguate a .1243foo identifier from a floating literal. 157 while (isDigit(*CurPtr)) 158 ++CurPtr; 159 160 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier, 161 AllowHashInIdentifier) || 162 *CurPtr == 'e' || *CurPtr == 'E') 163 return LexFloatLiteral(); 164 } 165 166 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier)) 167 ++CurPtr; 168 169 // Handle . as a special case. 170 if (CurPtr == TokStart+1 && TokStart[0] == '.') 171 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 172 173 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 174 } 175 176 /// LexSlash: Slash: / 177 /// C-Style Comment: /* ... */ 178 /// C-style Comment: // ... 179 AsmToken AsmLexer::LexSlash() { 180 if (!MAI.shouldAllowAdditionalComments()) { 181 IsAtStartOfStatement = false; 182 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 183 } 184 185 switch (*CurPtr) { 186 case '*': 187 IsAtStartOfStatement = false; 188 break; // C style comment. 189 case '/': 190 ++CurPtr; 191 return LexLineComment(); 192 default: 193 IsAtStartOfStatement = false; 194 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 195 } 196 197 // C Style comment. 198 ++CurPtr; // skip the star. 199 const char *CommentTextStart = CurPtr; 200 while (CurPtr != CurBuf.end()) { 201 switch (*CurPtr++) { 202 case '*': 203 // End of the comment? 204 if (*CurPtr != '/') 205 break; 206 // If we have a CommentConsumer, notify it about the comment. 207 if (CommentConsumer) { 208 CommentConsumer->HandleComment( 209 SMLoc::getFromPointer(CommentTextStart), 210 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); 211 } 212 ++CurPtr; // End the */. 213 return AsmToken(AsmToken::Comment, 214 StringRef(TokStart, CurPtr - TokStart)); 215 } 216 } 217 return ReturnError(TokStart, "unterminated comment"); 218 } 219 220 /// LexLineComment: Comment: #[^\n]* 221 /// : //[^\n]* 222 AsmToken AsmLexer::LexLineComment() { 223 // Mark This as an end of statement with a body of the 224 // comment. While it would be nicer to leave this two tokens, 225 // backwards compatability with TargetParsers makes keeping this in this form 226 // better. 227 const char *CommentTextStart = CurPtr; 228 int CurChar = getNextChar(); 229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 230 CurChar = getNextChar(); 231 const char *NewlinePtr = CurPtr; 232 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') 233 ++CurPtr; 234 235 // If we have a CommentConsumer, notify it about the comment. 236 if (CommentConsumer) { 237 CommentConsumer->HandleComment( 238 SMLoc::getFromPointer(CommentTextStart), 239 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart)); 240 } 241 242 IsAtStartOfLine = true; 243 // This is a whole line comment. leave newline 244 if (IsAtStartOfStatement) 245 return AsmToken(AsmToken::EndOfStatement, 246 StringRef(TokStart, CurPtr - TokStart)); 247 IsAtStartOfStatement = true; 248 249 return AsmToken(AsmToken::EndOfStatement, 250 StringRef(TokStart, CurPtr - 1 - TokStart)); 251 } 252 253 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 254 // Skip case-insensitive ULL, UL, U, L and LL suffixes. 255 if (CurPtr[0] == 'U' || CurPtr[0] == 'u') 256 ++CurPtr; 257 if (CurPtr[0] == 'L' || CurPtr[0] == 'l') 258 ++CurPtr; 259 if (CurPtr[0] == 'L' || CurPtr[0] == 'l') 260 ++CurPtr; 261 } 262 263 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the 264 // integer as a hexadecimal, possibly with leading zeroes. 265 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, 266 bool LexHex) { 267 const char *FirstNonDec = nullptr; 268 const char *LookAhead = CurPtr; 269 while (true) { 270 if (isDigit(*LookAhead)) { 271 ++LookAhead; 272 } else { 273 if (!FirstNonDec) 274 FirstNonDec = LookAhead; 275 276 // Keep going if we are looking for a 'h' suffix. 277 if (LexHex && isHexDigit(*LookAhead)) 278 ++LookAhead; 279 else 280 break; 281 } 282 } 283 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H'); 284 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec; 285 if (isHex) 286 return 16; 287 return DefaultRadix; 288 } 289 290 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) { 291 while (hexDigitValue(*CurPtr) < DefaultRadix) { 292 ++CurPtr; 293 } 294 return CurPtr; 295 } 296 297 static AsmToken intToken(StringRef Ref, APInt &Value) { 298 if (Value.isIntN(64)) 299 return AsmToken(AsmToken::Integer, Ref, Value); 300 return AsmToken(AsmToken::BigNum, Ref, Value); 301 } 302 303 static std::string radixName(unsigned Radix) { 304 switch (Radix) { 305 case 2: 306 return "binary"; 307 case 8: 308 return "octal"; 309 case 10: 310 return "decimal"; 311 case 16: 312 return "hexadecimal"; 313 default: 314 return "base-" + std::to_string(Radix); 315 } 316 } 317 318 /// LexDigit: First character is [0-9]. 319 /// Local Label: [0-9][:] 320 /// Forward/Backward Label: [0-9][fb] 321 /// Binary integer: 0b[01]+ 322 /// Octal integer: 0[0-7]+ 323 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 324 /// Decimal integer: [1-9][0-9]* 325 AsmToken AsmLexer::LexDigit() { 326 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY]) 327 // MASM-flavor octal integer: [0-7]+[oOqQ] 328 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT]) 329 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] 330 if (LexMasmIntegers && isdigit(CurPtr[-1])) { 331 const char *FirstNonBinary = 332 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr; 333 const char *FirstNonDecimal = 334 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr; 335 const char *OldCurPtr = CurPtr; 336 while (isHexDigit(*CurPtr)) { 337 switch (*CurPtr) { 338 default: 339 if (!FirstNonDecimal) { 340 FirstNonDecimal = CurPtr; 341 } 342 LLVM_FALLTHROUGH; 343 case '9': 344 case '8': 345 case '7': 346 case '6': 347 case '5': 348 case '4': 349 case '3': 350 case '2': 351 if (!FirstNonBinary) { 352 FirstNonBinary = CurPtr; 353 } 354 break; 355 case '1': 356 case '0': 357 break; 358 } 359 ++CurPtr; 360 } 361 if (*CurPtr == '.') { 362 // MASM float literals (other than hex floats) always contain a ".", and 363 // are always written in decimal. 364 ++CurPtr; 365 return LexFloatLiteral(); 366 } 367 368 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) { 369 ++CurPtr; 370 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 371 } 372 373 unsigned Radix = 0; 374 if (*CurPtr == 'h' || *CurPtr == 'H') { 375 // hexadecimal number 376 ++CurPtr; 377 Radix = 16; 378 } else if (*CurPtr == 't' || *CurPtr == 'T') { 379 // decimal number 380 ++CurPtr; 381 Radix = 10; 382 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' || 383 *CurPtr == 'Q') { 384 // octal number 385 ++CurPtr; 386 Radix = 8; 387 } else if (*CurPtr == 'y' || *CurPtr == 'Y') { 388 // binary number 389 ++CurPtr; 390 Radix = 2; 391 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr && 392 DefaultRadix < 14 && 393 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) { 394 Radix = 10; 395 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr && 396 DefaultRadix < 12 && 397 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) { 398 Radix = 2; 399 } 400 401 if (Radix) { 402 StringRef Result(TokStart, CurPtr - TokStart); 403 APInt Value(128, 0, true); 404 405 if (Result.drop_back().getAsInteger(Radix, Value)) 406 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 407 408 // MSVC accepts and ignores type suffices on integer literals. 409 SkipIgnoredIntegerSuffix(CurPtr); 410 411 return intToken(Result, Value); 412 } 413 414 // default-radix integers, or floating point numbers, fall through 415 CurPtr = OldCurPtr; 416 } 417 418 // MASM default-radix integers: [0-9a-fA-F]+ 419 // (All other integer literals have a radix specifier.) 420 if (LexMasmIntegers && UseMasmDefaultRadix) { 421 CurPtr = findLastDigit(CurPtr, 16); 422 StringRef Result(TokStart, CurPtr - TokStart); 423 424 APInt Value(128, 0, true); 425 if (Result.getAsInteger(DefaultRadix, Value)) { 426 return ReturnError(TokStart, 427 "invalid " + radixName(DefaultRadix) + " number"); 428 } 429 430 return intToken(Result, Value); 431 } 432 433 // Motorola hex integers: $[0-9a-fA-F]+ 434 if (LexMotorolaIntegers && CurPtr[-1] == '$') { 435 const char *NumStart = CurPtr; 436 while (isHexDigit(CurPtr[0])) 437 ++CurPtr; 438 439 APInt Result(128, 0); 440 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result)) 441 return ReturnError(TokStart, "invalid hexadecimal number"); 442 443 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 444 } 445 446 // Motorola binary integers: %[01]+ 447 if (LexMotorolaIntegers && CurPtr[-1] == '%') { 448 const char *NumStart = CurPtr; 449 while (*CurPtr == '0' || *CurPtr == '1') 450 ++CurPtr; 451 452 APInt Result(128, 0); 453 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result)) 454 return ReturnError(TokStart, "invalid binary number"); 455 456 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 457 } 458 459 // Decimal integer: [1-9][0-9]* 460 // HLASM-flavour decimal integer: [0-9][0-9]* 461 // FIXME: Later on, support for fb for HLASM has to be added in 462 // as they probably would be needed for asm goto 463 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') { 464 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers); 465 466 if (!LexHLASMIntegers) { 467 bool IsHex = Radix == 16; 468 // Check for floating point literals. 469 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { 470 if (*CurPtr == '.') 471 ++CurPtr; 472 return LexFloatLiteral(); 473 } 474 } 475 476 StringRef Result(TokStart, CurPtr - TokStart); 477 478 APInt Value(128, 0, true); 479 if (Result.getAsInteger(Radix, Value)) 480 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 481 482 if (!LexHLASMIntegers) 483 // The darwin/x86 (and x86-64) assembler accepts and ignores type 484 // suffices on integer literals. 485 SkipIgnoredIntegerSuffix(CurPtr); 486 487 return intToken(Result, Value); 488 } 489 490 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { 491 ++CurPtr; 492 // See if we actually have "0b" as part of something like "jmp 0b\n" 493 if (!isDigit(CurPtr[0])) { 494 --CurPtr; 495 StringRef Result(TokStart, CurPtr - TokStart); 496 return AsmToken(AsmToken::Integer, Result, 0); 497 } 498 const char *NumStart = CurPtr; 499 while (CurPtr[0] == '0' || CurPtr[0] == '1') 500 ++CurPtr; 501 502 // Requires at least one binary digit. 503 if (CurPtr == NumStart) 504 return ReturnError(TokStart, "invalid binary number"); 505 506 StringRef Result(TokStart, CurPtr - TokStart); 507 508 APInt Value(128, 0, true); 509 if (Result.substr(2).getAsInteger(2, Value)) 510 return ReturnError(TokStart, "invalid binary number"); 511 512 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 513 // suffixes on integer literals. 514 SkipIgnoredIntegerSuffix(CurPtr); 515 516 return intToken(Result, Value); 517 } 518 519 if ((*CurPtr == 'x') || (*CurPtr == 'X')) { 520 ++CurPtr; 521 const char *NumStart = CurPtr; 522 while (isHexDigit(CurPtr[0])) 523 ++CurPtr; 524 525 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be 526 // diagnosed by LexHexFloatLiteral). 527 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') 528 return LexHexFloatLiteral(NumStart == CurPtr); 529 530 // Otherwise requires at least one hex digit. 531 if (CurPtr == NumStart) 532 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 533 534 APInt Result(128, 0); 535 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 536 return ReturnError(TokStart, "invalid hexadecimal number"); 537 538 // Consume the optional [hH]. 539 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) 540 ++CurPtr; 541 542 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 543 // suffixes on integer literals. 544 SkipIgnoredIntegerSuffix(CurPtr); 545 546 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 547 } 548 549 // Either octal or hexadecimal. 550 APInt Value(128, 0, true); 551 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers); 552 StringRef Result(TokStart, CurPtr - TokStart); 553 if (Result.getAsInteger(Radix, Value)) 554 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 555 556 // Consume the [hH]. 557 if (Radix == 16) 558 ++CurPtr; 559 560 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 561 // suffixes on integer literals. 562 SkipIgnoredIntegerSuffix(CurPtr); 563 564 return intToken(Result, Value); 565 } 566 567 /// LexSingleQuote: Integer: 'b' 568 AsmToken AsmLexer::LexSingleQuote() { 569 int CurChar = getNextChar(); 570 571 if (LexHLASMStrings) 572 return ReturnError(TokStart, "invalid usage of character literals"); 573 574 if (LexMasmStrings) { 575 while (CurChar != EOF) { 576 if (CurChar != '\'') { 577 CurChar = getNextChar(); 578 } else if (peekNextChar() == '\'') { 579 // In MASM single-quote strings, doubled single-quotes mean an escaped 580 // single quote, so should be lexed in. 581 getNextChar(); 582 CurChar = getNextChar(); 583 } else { 584 break; 585 } 586 } 587 if (CurChar == EOF) 588 return ReturnError(TokStart, "unterminated string constant"); 589 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 590 } 591 592 if (CurChar == '\\') 593 CurChar = getNextChar(); 594 595 if (CurChar == EOF) 596 return ReturnError(TokStart, "unterminated single quote"); 597 598 CurChar = getNextChar(); 599 600 if (CurChar != '\'') 601 return ReturnError(TokStart, "single quote way too long"); 602 603 // The idea here being that 'c' is basically just an integral 604 // constant. 605 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 606 long long Value; 607 608 if (Res.startswith("\'\\")) { 609 char theChar = Res[2]; 610 switch (theChar) { 611 default: Value = theChar; break; 612 case '\'': Value = '\''; break; 613 case 't': Value = '\t'; break; 614 case 'n': Value = '\n'; break; 615 case 'b': Value = '\b'; break; 616 case 'f': Value = '\f'; break; 617 case 'r': Value = '\r'; break; 618 } 619 } else 620 Value = TokStart[1]; 621 622 return AsmToken(AsmToken::Integer, Res, Value); 623 } 624 625 /// LexQuote: String: "..." 626 AsmToken AsmLexer::LexQuote() { 627 int CurChar = getNextChar(); 628 if (LexHLASMStrings) 629 return ReturnError(TokStart, "invalid usage of string literals"); 630 631 if (LexMasmStrings) { 632 while (CurChar != EOF) { 633 if (CurChar != '"') { 634 CurChar = getNextChar(); 635 } else if (peekNextChar() == '"') { 636 // In MASM double-quoted strings, doubled double-quotes mean an escaped 637 // double quote, so should be lexed in. 638 getNextChar(); 639 CurChar = getNextChar(); 640 } else { 641 break; 642 } 643 } 644 if (CurChar == EOF) 645 return ReturnError(TokStart, "unterminated string constant"); 646 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 647 } 648 649 // TODO: does gas allow multiline string constants? 650 while (CurChar != '"') { 651 if (CurChar == '\\') { 652 // Allow \", etc. 653 CurChar = getNextChar(); 654 } 655 656 if (CurChar == EOF) 657 return ReturnError(TokStart, "unterminated string constant"); 658 659 CurChar = getNextChar(); 660 } 661 662 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 663 } 664 665 StringRef AsmLexer::LexUntilEndOfStatement() { 666 TokStart = CurPtr; 667 668 while (!isAtStartOfComment(CurPtr) && // Start of line comment. 669 !isAtStatementSeparator(CurPtr) && // End of statement marker. 670 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 671 ++CurPtr; 672 } 673 return StringRef(TokStart, CurPtr-TokStart); 674 } 675 676 StringRef AsmLexer::LexUntilEndOfLine() { 677 TokStart = CurPtr; 678 679 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 680 ++CurPtr; 681 } 682 return StringRef(TokStart, CurPtr-TokStart); 683 } 684 685 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, 686 bool ShouldSkipSpace) { 687 SaveAndRestore<const char *> SavedTokenStart(TokStart); 688 SaveAndRestore<const char *> SavedCurPtr(CurPtr); 689 SaveAndRestore<bool> SavedAtStartOfLine(IsAtStartOfLine); 690 SaveAndRestore<bool> SavedAtStartOfStatement(IsAtStartOfStatement); 691 SaveAndRestore<bool> SavedSkipSpace(SkipSpace, ShouldSkipSpace); 692 SaveAndRestore<bool> SavedIsPeeking(IsPeeking, true); 693 std::string SavedErr = getErr(); 694 SMLoc SavedErrLoc = getErrLoc(); 695 696 size_t ReadCount; 697 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { 698 AsmToken Token = LexToken(); 699 700 Buf[ReadCount] = Token; 701 702 if (Token.is(AsmToken::Eof)) 703 break; 704 } 705 706 SetError(SavedErrLoc, SavedErr); 707 return ReadCount; 708 } 709 710 bool AsmLexer::isAtStartOfComment(const char *Ptr) { 711 if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement) 712 return false; 713 714 StringRef CommentString = MAI.getCommentString(); 715 716 if (CommentString.size() == 1) 717 return CommentString[0] == Ptr[0]; 718 719 // Allow # preprocessor commments also be counted as comments for "##" cases 720 if (CommentString[1] == '#') 721 return CommentString[0] == Ptr[0]; 722 723 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0; 724 } 725 726 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 727 return strncmp(Ptr, MAI.getSeparatorString(), 728 strlen(MAI.getSeparatorString())) == 0; 729 } 730 731 AsmToken AsmLexer::LexToken() { 732 TokStart = CurPtr; 733 // This always consumes at least one character. 734 int CurChar = getNextChar(); 735 736 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) { 737 // If this starts with a '#', this may be a cpp 738 // hash directive and otherwise a line comment. 739 AsmToken TokenBuf[2]; 740 MutableArrayRef<AsmToken> Buf(TokenBuf, 2); 741 size_t num = peekTokens(Buf, true); 742 // There cannot be a space preceding this 743 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) && 744 TokenBuf[1].is(AsmToken::String)) { 745 CurPtr = TokStart; // reset curPtr; 746 StringRef s = LexUntilEndOfLine(); 747 UnLex(TokenBuf[1]); 748 UnLex(TokenBuf[0]); 749 return AsmToken(AsmToken::HashDirective, s); 750 } 751 752 if (MAI.shouldAllowAdditionalComments()) 753 return LexLineComment(); 754 } 755 756 if (isAtStartOfComment(TokStart)) 757 return LexLineComment(); 758 759 if (isAtStatementSeparator(TokStart)) { 760 CurPtr += strlen(MAI.getSeparatorString()) - 1; 761 IsAtStartOfLine = true; 762 IsAtStartOfStatement = true; 763 return AsmToken(AsmToken::EndOfStatement, 764 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 765 } 766 767 // If we're missing a newline at EOF, make sure we still get an 768 // EndOfStatement token before the Eof token. 769 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) { 770 IsAtStartOfLine = true; 771 IsAtStartOfStatement = true; 772 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); 773 } 774 IsAtStartOfLine = false; 775 bool OldIsAtStartOfStatement = IsAtStartOfStatement; 776 IsAtStartOfStatement = false; 777 switch (CurChar) { 778 default: 779 // Handle identifier: [a-zA-Z_.?][a-zA-Z0-9_$.@#?]* 780 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.' || 781 (MAI.doesAllowQuestionAtStartOfIdentifier() && CurChar == '?')) 782 return LexIdentifier(); 783 784 // Unknown character, emit an error. 785 return ReturnError(TokStart, "invalid character in input"); 786 case EOF: 787 if (EndStatementAtEOF) { 788 IsAtStartOfLine = true; 789 IsAtStartOfStatement = true; 790 } 791 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 792 case 0: 793 case ' ': 794 case '\t': 795 IsAtStartOfStatement = OldIsAtStartOfStatement; 796 while (*CurPtr == ' ' || *CurPtr == '\t') 797 CurPtr++; 798 if (SkipSpace) 799 return LexToken(); // Ignore whitespace. 800 else 801 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart)); 802 case '\r': { 803 IsAtStartOfLine = true; 804 IsAtStartOfStatement = true; 805 // If this is a CR followed by LF, treat that as one token. 806 if (CurPtr != CurBuf.end() && *CurPtr == '\n') 807 ++CurPtr; 808 return AsmToken(AsmToken::EndOfStatement, 809 StringRef(TokStart, CurPtr - TokStart)); 810 } 811 case '\n': 812 IsAtStartOfLine = true; 813 IsAtStartOfStatement = true; 814 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 815 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 816 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 817 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 818 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 819 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 820 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 821 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 822 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 823 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 824 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 825 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 826 case '$': { 827 if (LexMotorolaIntegers && isHexDigit(*CurPtr)) 828 return LexDigit(); 829 if (MAI.doesAllowDollarAtStartOfIdentifier()) 830 return LexIdentifier(); 831 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 832 } 833 case '@': { 834 if (MAI.doesAllowAtAtStartOfIdentifier()) 835 return LexIdentifier(); 836 return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 837 } 838 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 839 case '=': 840 if (*CurPtr == '=') { 841 ++CurPtr; 842 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 843 } 844 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 845 case '-': 846 if (*CurPtr == '>') { 847 ++CurPtr; 848 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); 849 } 850 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 851 case '|': 852 if (*CurPtr == '|') { 853 ++CurPtr; 854 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 855 } 856 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 857 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 858 case '&': 859 if (*CurPtr == '&') { 860 ++CurPtr; 861 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 862 } 863 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 864 case '!': 865 if (*CurPtr == '=') { 866 ++CurPtr; 867 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 868 } 869 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 870 case '%': 871 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) { 872 return LexDigit(); 873 } 874 875 if (MAI.hasMipsExpressions()) { 876 AsmToken::TokenKind Operator; 877 unsigned OperatorLength; 878 879 std::tie(Operator, OperatorLength) = 880 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>( 881 StringRef(CurPtr)) 882 .StartsWith("call16", {AsmToken::PercentCall16, 7}) 883 .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8}) 884 .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8}) 885 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10}) 886 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10}) 887 .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9}) 888 .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7}) 889 .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7}) 890 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9}) 891 .StartsWith("got_page", {AsmToken::PercentGot_Page, 9}) 892 .StartsWith("gottprel", {AsmToken::PercentGottprel, 9}) 893 .StartsWith("got", {AsmToken::PercentGot, 4}) 894 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7}) 895 .StartsWith("higher", {AsmToken::PercentHigher, 7}) 896 .StartsWith("highest", {AsmToken::PercentHighest, 8}) 897 .StartsWith("hi", {AsmToken::PercentHi, 3}) 898 .StartsWith("lo", {AsmToken::PercentLo, 3}) 899 .StartsWith("neg", {AsmToken::PercentNeg, 4}) 900 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9}) 901 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9}) 902 .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6}) 903 .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7}) 904 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9}) 905 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9}) 906 .Default({AsmToken::Percent, 1}); 907 908 if (Operator != AsmToken::Percent) { 909 CurPtr += OperatorLength - 1; 910 return AsmToken(Operator, StringRef(TokStart, OperatorLength)); 911 } 912 } 913 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 914 case '/': 915 IsAtStartOfStatement = OldIsAtStartOfStatement; 916 return LexSlash(); 917 case '#': { 918 if (MAI.doesAllowHashAtStartOfIdentifier()) 919 return LexIdentifier(); 920 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 921 } 922 case '\'': return LexSingleQuote(); 923 case '"': return LexQuote(); 924 case '0': case '1': case '2': case '3': case '4': 925 case '5': case '6': case '7': case '8': case '9': 926 return LexDigit(); 927 case '<': 928 switch (*CurPtr) { 929 case '<': 930 ++CurPtr; 931 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); 932 case '=': 933 ++CurPtr; 934 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); 935 case '>': 936 ++CurPtr; 937 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); 938 default: 939 return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 940 } 941 case '>': 942 switch (*CurPtr) { 943 case '>': 944 ++CurPtr; 945 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); 946 case '=': 947 ++CurPtr; 948 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); 949 default: 950 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 951 } 952 953 // TODO: Quoted identifiers (objc methods etc) 954 // local labels: [0-9][:] 955 // Forward/backward labels: [0-9][fb] 956 // Integers, fp constants, character constants. 957 } 958 } 959