1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class implements the lexer for assembly files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/MC/MCParser/AsmLexer.h" 14 #include "llvm/ADT/APInt.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/ADT/StringSwitch.h" 19 #include "llvm/MC/MCAsmInfo.h" 20 #include "llvm/MC/MCParser/MCAsmLexer.h" 21 #include "llvm/Support/Compiler.h" 22 #include "llvm/Support/SMLoc.h" 23 #include "llvm/Support/SaveAndRestore.h" 24 #include <cassert> 25 #include <cctype> 26 #include <cstdio> 27 #include <cstring> 28 #include <string> 29 #include <tuple> 30 #include <utility> 31 32 using namespace llvm; 33 34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { 35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@"); 36 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); 37 } 38 39 AsmLexer::~AsmLexer() = default; 40 41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr, 42 bool EndStatementAtEOF) { 43 CurBuf = Buf; 44 45 if (ptr) 46 CurPtr = ptr; 47 else 48 CurPtr = CurBuf.begin(); 49 50 TokStart = nullptr; 51 this->EndStatementAtEOF = EndStatementAtEOF; 52 } 53 54 /// ReturnError - Set the error to the specified string at the specified 55 /// location. This is defined to always return AsmToken::Error. 56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 57 SetError(SMLoc::getFromPointer(Loc), Msg); 58 59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc)); 60 } 61 62 int AsmLexer::getNextChar() { 63 if (CurPtr == CurBuf.end()) 64 return EOF; 65 return (unsigned char)*CurPtr++; 66 } 67 68 int AsmLexer::peekNextChar() { 69 if (CurPtr == CurBuf.end()) 70 return EOF; 71 return (unsigned char)*CurPtr; 72 } 73 74 /// The leading integral digit sequence and dot should have already been 75 /// consumed, some or all of the fractional digit sequence *can* have been 76 /// consumed. 77 AsmToken AsmLexer::LexFloatLiteral() { 78 // Skip the fractional digit sequence. 79 while (isDigit(*CurPtr)) 80 ++CurPtr; 81 82 if (*CurPtr == '-' || *CurPtr == '+') 83 return ReturnError(CurPtr, "invalid sign in float literal"); 84 85 // Check for exponent 86 if ((*CurPtr == 'e' || *CurPtr == 'E')) { 87 ++CurPtr; 88 89 if (*CurPtr == '-' || *CurPtr == '+') 90 ++CurPtr; 91 92 while (isDigit(*CurPtr)) 93 ++CurPtr; 94 } 95 96 return AsmToken(AsmToken::Real, 97 StringRef(TokStart, CurPtr - TokStart)); 98 } 99 100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ 101 /// while making sure there are enough actual digits around for the constant to 102 /// be valid. 103 /// 104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed 105 /// before we get here. 106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { 107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && 108 "unexpected parse state in floating hex"); 109 bool NoFracDigits = true; 110 111 // Skip the fractional part if there is one 112 if (*CurPtr == '.') { 113 ++CurPtr; 114 115 const char *FracStart = CurPtr; 116 while (isHexDigit(*CurPtr)) 117 ++CurPtr; 118 119 NoFracDigits = CurPtr == FracStart; 120 } 121 122 if (NoIntDigits && NoFracDigits) 123 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 124 "expected at least one significand digit"); 125 126 // Make sure we do have some kind of proper exponent part 127 if (*CurPtr != 'p' && *CurPtr != 'P') 128 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 129 "expected exponent part 'p'"); 130 ++CurPtr; 131 132 if (*CurPtr == '+' || *CurPtr == '-') 133 ++CurPtr; 134 135 // N.b. exponent digits are *not* hex 136 const char *ExpStart = CurPtr; 137 while (isDigit(*CurPtr)) 138 ++CurPtr; 139 140 if (CurPtr == ExpStart) 141 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 142 "expected at least one exponent digit"); 143 144 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 145 } 146 147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]* 148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) { 149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' || 150 (AllowAt && C == '@') || (AllowHash && C == '#'); 151 } 152 153 AsmToken AsmLexer::LexIdentifier() { 154 // Check for floating point literals. 155 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) { 156 // Disambiguate a .1243foo identifier from a floating literal. 157 while (isDigit(*CurPtr)) 158 ++CurPtr; 159 160 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier, 161 AllowHashInIdentifier) || 162 *CurPtr == 'e' || *CurPtr == 'E') 163 return LexFloatLiteral(); 164 } 165 166 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier)) 167 ++CurPtr; 168 169 // Handle . as a special case. 170 if (CurPtr == TokStart+1 && TokStart[0] == '.') 171 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 172 173 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 174 } 175 176 /// LexSlash: Slash: / 177 /// C-Style Comment: /* ... */ 178 /// C-style Comment: // ... 179 AsmToken AsmLexer::LexSlash() { 180 if (!MAI.shouldAllowAdditionalComments()) { 181 IsAtStartOfStatement = false; 182 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 183 } 184 185 switch (*CurPtr) { 186 case '*': 187 IsAtStartOfStatement = false; 188 break; // C style comment. 189 case '/': 190 ++CurPtr; 191 return LexLineComment(); 192 default: 193 IsAtStartOfStatement = false; 194 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 195 } 196 197 // C Style comment. 198 ++CurPtr; // skip the star. 199 const char *CommentTextStart = CurPtr; 200 while (CurPtr != CurBuf.end()) { 201 switch (*CurPtr++) { 202 case '*': 203 // End of the comment? 204 if (*CurPtr != '/') 205 break; 206 // If we have a CommentConsumer, notify it about the comment. 207 if (CommentConsumer) { 208 CommentConsumer->HandleComment( 209 SMLoc::getFromPointer(CommentTextStart), 210 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); 211 } 212 ++CurPtr; // End the */. 213 return AsmToken(AsmToken::Comment, 214 StringRef(TokStart, CurPtr - TokStart)); 215 } 216 } 217 return ReturnError(TokStart, "unterminated comment"); 218 } 219 220 /// LexLineComment: Comment: #[^\n]* 221 /// : //[^\n]* 222 AsmToken AsmLexer::LexLineComment() { 223 // Mark This as an end of statement with a body of the 224 // comment. While it would be nicer to leave this two tokens, 225 // backwards compatability with TargetParsers makes keeping this in this form 226 // better. 227 const char *CommentTextStart = CurPtr; 228 int CurChar = getNextChar(); 229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 230 CurChar = getNextChar(); 231 const char *NewlinePtr = CurPtr; 232 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') 233 ++CurPtr; 234 235 // If we have a CommentConsumer, notify it about the comment. 236 if (CommentConsumer) { 237 CommentConsumer->HandleComment( 238 SMLoc::getFromPointer(CommentTextStart), 239 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart)); 240 } 241 242 IsAtStartOfLine = true; 243 // This is a whole line comment. leave newline 244 if (IsAtStartOfStatement) 245 return AsmToken(AsmToken::EndOfStatement, 246 StringRef(TokStart, CurPtr - TokStart)); 247 IsAtStartOfStatement = true; 248 249 return AsmToken(AsmToken::EndOfStatement, 250 StringRef(TokStart, CurPtr - 1 - TokStart)); 251 } 252 253 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 254 // Skip case-insensitive ULL, UL, U, L and LL suffixes. 255 if (CurPtr[0] == 'U' || CurPtr[0] == 'u') 256 ++CurPtr; 257 if (CurPtr[0] == 'L' || CurPtr[0] == 'l') 258 ++CurPtr; 259 if (CurPtr[0] == 'L' || CurPtr[0] == 'l') 260 ++CurPtr; 261 } 262 263 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the 264 // integer as a hexadecimal, possibly with leading zeroes. 265 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, 266 bool LexHex) { 267 const char *FirstNonDec = nullptr; 268 const char *LookAhead = CurPtr; 269 while (true) { 270 if (isDigit(*LookAhead)) { 271 ++LookAhead; 272 } else { 273 if (!FirstNonDec) 274 FirstNonDec = LookAhead; 275 276 // Keep going if we are looking for a 'h' suffix. 277 if (LexHex && isHexDigit(*LookAhead)) 278 ++LookAhead; 279 else 280 break; 281 } 282 } 283 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H'); 284 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec; 285 if (isHex) 286 return 16; 287 return DefaultRadix; 288 } 289 290 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) { 291 while (hexDigitValue(*CurPtr) < DefaultRadix) { 292 ++CurPtr; 293 } 294 return CurPtr; 295 } 296 297 static AsmToken intToken(StringRef Ref, APInt &Value) { 298 if (Value.isIntN(64)) 299 return AsmToken(AsmToken::Integer, Ref, Value); 300 return AsmToken(AsmToken::BigNum, Ref, Value); 301 } 302 303 static std::string radixName(unsigned Radix) { 304 switch (Radix) { 305 case 2: 306 return "binary"; 307 case 8: 308 return "octal"; 309 case 10: 310 return "decimal"; 311 case 16: 312 return "hexadecimal"; 313 default: 314 return "base-" + std::to_string(Radix); 315 } 316 } 317 318 /// LexDigit: First character is [0-9]. 319 /// Local Label: [0-9][:] 320 /// Forward/Backward Label: [0-9][fb] 321 /// Binary integer: 0b[01]+ 322 /// Octal integer: 0[0-7]+ 323 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 324 /// Decimal integer: [1-9][0-9]* 325 AsmToken AsmLexer::LexDigit() { 326 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY]) 327 // MASM-flavor octal integer: [0-7]+[oOqQ] 328 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT]) 329 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] 330 if (LexMasmIntegers && isdigit(CurPtr[-1])) { 331 const char *FirstNonBinary = 332 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr; 333 const char *FirstNonDecimal = 334 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr; 335 const char *OldCurPtr = CurPtr; 336 while (isHexDigit(*CurPtr)) { 337 switch (*CurPtr) { 338 default: 339 if (!FirstNonDecimal) { 340 FirstNonDecimal = CurPtr; 341 } 342 [[fallthrough]]; 343 case '9': 344 case '8': 345 case '7': 346 case '6': 347 case '5': 348 case '4': 349 case '3': 350 case '2': 351 if (!FirstNonBinary) { 352 FirstNonBinary = CurPtr; 353 } 354 break; 355 case '1': 356 case '0': 357 break; 358 } 359 ++CurPtr; 360 } 361 if (*CurPtr == '.') { 362 // MASM float literals (other than hex floats) always contain a ".", and 363 // are always written in decimal. 364 ++CurPtr; 365 return LexFloatLiteral(); 366 } 367 368 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) { 369 ++CurPtr; 370 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 371 } 372 373 unsigned Radix = 0; 374 if (*CurPtr == 'h' || *CurPtr == 'H') { 375 // hexadecimal number 376 ++CurPtr; 377 Radix = 16; 378 } else if (*CurPtr == 't' || *CurPtr == 'T') { 379 // decimal number 380 ++CurPtr; 381 Radix = 10; 382 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' || 383 *CurPtr == 'Q') { 384 // octal number 385 ++CurPtr; 386 Radix = 8; 387 } else if (*CurPtr == 'y' || *CurPtr == 'Y') { 388 // binary number 389 ++CurPtr; 390 Radix = 2; 391 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr && 392 DefaultRadix < 14 && 393 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) { 394 Radix = 10; 395 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr && 396 DefaultRadix < 12 && 397 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) { 398 Radix = 2; 399 } 400 401 if (Radix) { 402 StringRef Result(TokStart, CurPtr - TokStart); 403 APInt Value(128, 0, true); 404 405 if (Result.drop_back().getAsInteger(Radix, Value)) 406 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 407 408 // MSVC accepts and ignores type suffices on integer literals. 409 SkipIgnoredIntegerSuffix(CurPtr); 410 411 return intToken(Result, Value); 412 } 413 414 // default-radix integers, or floating point numbers, fall through 415 CurPtr = OldCurPtr; 416 } 417 418 // MASM default-radix integers: [0-9a-fA-F]+ 419 // (All other integer literals have a radix specifier.) 420 if (LexMasmIntegers && UseMasmDefaultRadix) { 421 CurPtr = findLastDigit(CurPtr, 16); 422 StringRef Result(TokStart, CurPtr - TokStart); 423 424 APInt Value(128, 0, true); 425 if (Result.getAsInteger(DefaultRadix, Value)) { 426 return ReturnError(TokStart, 427 "invalid " + radixName(DefaultRadix) + " number"); 428 } 429 430 return intToken(Result, Value); 431 } 432 433 // Motorola hex integers: $[0-9a-fA-F]+ 434 if (LexMotorolaIntegers && CurPtr[-1] == '$') { 435 const char *NumStart = CurPtr; 436 while (isHexDigit(CurPtr[0])) 437 ++CurPtr; 438 439 APInt Result(128, 0); 440 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result)) 441 return ReturnError(TokStart, "invalid hexadecimal number"); 442 443 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 444 } 445 446 // Motorola binary integers: %[01]+ 447 if (LexMotorolaIntegers && CurPtr[-1] == '%') { 448 const char *NumStart = CurPtr; 449 while (*CurPtr == '0' || *CurPtr == '1') 450 ++CurPtr; 451 452 APInt Result(128, 0); 453 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result)) 454 return ReturnError(TokStart, "invalid binary number"); 455 456 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 457 } 458 459 // Decimal integer: [1-9][0-9]* 460 // HLASM-flavour decimal integer: [0-9][0-9]* 461 // FIXME: Later on, support for fb for HLASM has to be added in 462 // as they probably would be needed for asm goto 463 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') { 464 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers); 465 466 if (!LexHLASMIntegers) { 467 bool IsHex = Radix == 16; 468 // Check for floating point literals. 469 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { 470 if (*CurPtr == '.') 471 ++CurPtr; 472 return LexFloatLiteral(); 473 } 474 } 475 476 StringRef Result(TokStart, CurPtr - TokStart); 477 478 APInt Value(128, 0, true); 479 if (Result.getAsInteger(Radix, Value)) 480 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 481 482 if (!LexHLASMIntegers) 483 // The darwin/x86 (and x86-64) assembler accepts and ignores type 484 // suffices on integer literals. 485 SkipIgnoredIntegerSuffix(CurPtr); 486 487 return intToken(Result, Value); 488 } 489 490 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { 491 ++CurPtr; 492 // See if we actually have "0b" as part of something like "jmp 0b\n" 493 if (!isDigit(CurPtr[0])) { 494 --CurPtr; 495 StringRef Result(TokStart, CurPtr - TokStart); 496 return AsmToken(AsmToken::Integer, Result, 0); 497 } 498 const char *NumStart = CurPtr; 499 while (CurPtr[0] == '0' || CurPtr[0] == '1') 500 ++CurPtr; 501 502 // Requires at least one binary digit. 503 if (CurPtr == NumStart) 504 return ReturnError(TokStart, "invalid binary number"); 505 506 StringRef Result(TokStart, CurPtr - TokStart); 507 508 APInt Value(128, 0, true); 509 if (Result.substr(2).getAsInteger(2, Value)) 510 return ReturnError(TokStart, "invalid binary number"); 511 512 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 513 // suffixes on integer literals. 514 SkipIgnoredIntegerSuffix(CurPtr); 515 516 return intToken(Result, Value); 517 } 518 519 if ((*CurPtr == 'x') || (*CurPtr == 'X')) { 520 ++CurPtr; 521 const char *NumStart = CurPtr; 522 while (isHexDigit(CurPtr[0])) 523 ++CurPtr; 524 525 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be 526 // diagnosed by LexHexFloatLiteral). 527 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') 528 return LexHexFloatLiteral(NumStart == CurPtr); 529 530 // Otherwise requires at least one hex digit. 531 if (CurPtr == NumStart) 532 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 533 534 APInt Result(128, 0); 535 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 536 return ReturnError(TokStart, "invalid hexadecimal number"); 537 538 // Consume the optional [hH]. 539 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) 540 ++CurPtr; 541 542 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 543 // suffixes on integer literals. 544 SkipIgnoredIntegerSuffix(CurPtr); 545 546 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 547 } 548 549 // Either octal or hexadecimal. 550 APInt Value(128, 0, true); 551 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers); 552 StringRef Result(TokStart, CurPtr - TokStart); 553 if (Result.getAsInteger(Radix, Value)) 554 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 555 556 // Consume the [hH]. 557 if (Radix == 16) 558 ++CurPtr; 559 560 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 561 // suffixes on integer literals. 562 SkipIgnoredIntegerSuffix(CurPtr); 563 564 return intToken(Result, Value); 565 } 566 567 /// LexSingleQuote: Integer: 'b' 568 AsmToken AsmLexer::LexSingleQuote() { 569 int CurChar = getNextChar(); 570 571 if (LexHLASMStrings) 572 return ReturnError(TokStart, "invalid usage of character literals"); 573 574 if (LexMasmStrings) { 575 while (CurChar != EOF) { 576 if (CurChar != '\'') { 577 CurChar = getNextChar(); 578 } else if (peekNextChar() == '\'') { 579 // In MASM single-quote strings, doubled single-quotes mean an escaped 580 // single quote, so should be lexed in. 581 (void)getNextChar(); 582 CurChar = getNextChar(); 583 } else { 584 break; 585 } 586 } 587 if (CurChar == EOF) 588 return ReturnError(TokStart, "unterminated string constant"); 589 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 590 } 591 592 if (CurChar == '\\') 593 CurChar = getNextChar(); 594 595 if (CurChar == EOF) 596 return ReturnError(TokStart, "unterminated single quote"); 597 598 CurChar = getNextChar(); 599 600 if (CurChar != '\'') 601 return ReturnError(TokStart, "single quote way too long"); 602 603 // The idea here being that 'c' is basically just an integral 604 // constant. 605 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 606 long long Value; 607 608 if (Res.starts_with("\'\\")) { 609 char theChar = Res[2]; 610 switch (theChar) { 611 default: Value = theChar; break; 612 case '\'': Value = '\''; break; 613 case 't': Value = '\t'; break; 614 case 'n': Value = '\n'; break; 615 case 'b': Value = '\b'; break; 616 case 'f': Value = '\f'; break; 617 case 'r': Value = '\r'; break; 618 } 619 } else 620 Value = TokStart[1]; 621 622 return AsmToken(AsmToken::Integer, Res, Value); 623 } 624 625 /// LexQuote: String: "..." 626 AsmToken AsmLexer::LexQuote() { 627 int CurChar = getNextChar(); 628 if (LexHLASMStrings) 629 return ReturnError(TokStart, "invalid usage of string literals"); 630 631 if (LexMasmStrings) { 632 while (CurChar != EOF) { 633 if (CurChar != '"') { 634 CurChar = getNextChar(); 635 } else if (peekNextChar() == '"') { 636 // In MASM double-quoted strings, doubled double-quotes mean an escaped 637 // double quote, so should be lexed in. 638 (void)getNextChar(); 639 CurChar = getNextChar(); 640 } else { 641 break; 642 } 643 } 644 if (CurChar == EOF) 645 return ReturnError(TokStart, "unterminated string constant"); 646 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 647 } 648 649 while (CurChar != '"') { 650 if (CurChar == '\\') { 651 // Allow \", etc. 652 CurChar = getNextChar(); 653 } 654 655 if (CurChar == EOF) 656 return ReturnError(TokStart, "unterminated string constant"); 657 658 CurChar = getNextChar(); 659 } 660 661 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 662 } 663 664 StringRef AsmLexer::LexUntilEndOfStatement() { 665 TokStart = CurPtr; 666 667 while (!isAtStartOfComment(CurPtr) && // Start of line comment. 668 !isAtStatementSeparator(CurPtr) && // End of statement marker. 669 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 670 ++CurPtr; 671 } 672 return StringRef(TokStart, CurPtr-TokStart); 673 } 674 675 StringRef AsmLexer::LexUntilEndOfLine() { 676 TokStart = CurPtr; 677 678 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 679 ++CurPtr; 680 } 681 return StringRef(TokStart, CurPtr-TokStart); 682 } 683 684 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, 685 bool ShouldSkipSpace) { 686 SaveAndRestore SavedTokenStart(TokStart); 687 SaveAndRestore SavedCurPtr(CurPtr); 688 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine); 689 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement); 690 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace); 691 SaveAndRestore SavedIsPeeking(IsPeeking, true); 692 std::string SavedErr = getErr(); 693 SMLoc SavedErrLoc = getErrLoc(); 694 695 size_t ReadCount; 696 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { 697 AsmToken Token = LexToken(); 698 699 Buf[ReadCount] = Token; 700 701 if (Token.is(AsmToken::Eof)) 702 break; 703 } 704 705 SetError(SavedErrLoc, SavedErr); 706 return ReadCount; 707 } 708 709 bool AsmLexer::isAtStartOfComment(const char *Ptr) { 710 if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement) 711 return false; 712 713 StringRef CommentString = MAI.getCommentString(); 714 715 if (CommentString.size() == 1) 716 return CommentString[0] == Ptr[0]; 717 718 // Allow # preprocessor comments also be counted as comments for "##" cases 719 if (CommentString[1] == '#') 720 return CommentString[0] == Ptr[0]; 721 722 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0; 723 } 724 725 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 726 return strncmp(Ptr, MAI.getSeparatorString(), 727 strlen(MAI.getSeparatorString())) == 0; 728 } 729 730 AsmToken AsmLexer::LexToken() { 731 TokStart = CurPtr; 732 // This always consumes at least one character. 733 int CurChar = getNextChar(); 734 735 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) { 736 // If this starts with a '#', this may be a cpp 737 // hash directive and otherwise a line comment. 738 AsmToken TokenBuf[2]; 739 MutableArrayRef<AsmToken> Buf(TokenBuf, 2); 740 size_t num = peekTokens(Buf, true); 741 // There cannot be a space preceding this 742 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) && 743 TokenBuf[1].is(AsmToken::String)) { 744 CurPtr = TokStart; // reset curPtr; 745 StringRef s = LexUntilEndOfLine(); 746 UnLex(TokenBuf[1]); 747 UnLex(TokenBuf[0]); 748 return AsmToken(AsmToken::HashDirective, s); 749 } 750 751 if (MAI.shouldAllowAdditionalComments()) 752 return LexLineComment(); 753 } 754 755 if (isAtStartOfComment(TokStart)) 756 return LexLineComment(); 757 758 if (isAtStatementSeparator(TokStart)) { 759 CurPtr += strlen(MAI.getSeparatorString()) - 1; 760 IsAtStartOfLine = true; 761 IsAtStartOfStatement = true; 762 return AsmToken(AsmToken::EndOfStatement, 763 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 764 } 765 766 // If we're missing a newline at EOF, make sure we still get an 767 // EndOfStatement token before the Eof token. 768 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) { 769 IsAtStartOfLine = true; 770 IsAtStartOfStatement = true; 771 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); 772 } 773 IsAtStartOfLine = false; 774 bool OldIsAtStartOfStatement = IsAtStartOfStatement; 775 IsAtStartOfStatement = false; 776 switch (CurChar) { 777 default: 778 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]* 779 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of 780 // an identifier is target-dependent. These characters are handled in the 781 // respective switch cases. 782 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 783 return LexIdentifier(); 784 785 // Unknown character, emit an error. 786 return ReturnError(TokStart, "invalid character in input"); 787 case EOF: 788 if (EndStatementAtEOF) { 789 IsAtStartOfLine = true; 790 IsAtStartOfStatement = true; 791 } 792 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 793 case 0: 794 case ' ': 795 case '\t': 796 IsAtStartOfStatement = OldIsAtStartOfStatement; 797 while (*CurPtr == ' ' || *CurPtr == '\t') 798 CurPtr++; 799 if (SkipSpace) 800 return LexToken(); // Ignore whitespace. 801 else 802 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart)); 803 case '\r': { 804 IsAtStartOfLine = true; 805 IsAtStartOfStatement = true; 806 // If this is a CR followed by LF, treat that as one token. 807 if (CurPtr != CurBuf.end() && *CurPtr == '\n') 808 ++CurPtr; 809 return AsmToken(AsmToken::EndOfStatement, 810 StringRef(TokStart, CurPtr - TokStart)); 811 } 812 case '\n': 813 IsAtStartOfLine = true; 814 IsAtStartOfStatement = true; 815 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 816 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 817 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 818 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 819 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 820 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 821 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 822 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 823 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 824 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 825 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 826 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 827 case '$': { 828 if (LexMotorolaIntegers && isHexDigit(*CurPtr)) 829 return LexDigit(); 830 if (MAI.doesAllowDollarAtStartOfIdentifier()) 831 return LexIdentifier(); 832 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 833 } 834 case '@': 835 if (MAI.doesAllowAtAtStartOfIdentifier()) 836 return LexIdentifier(); 837 return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 838 case '#': 839 if (MAI.doesAllowHashAtStartOfIdentifier()) 840 return LexIdentifier(); 841 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 842 case '?': 843 if (MAI.doesAllowQuestionAtStartOfIdentifier()) 844 return LexIdentifier(); 845 return AsmToken(AsmToken::Question, StringRef(TokStart, 1)); 846 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 847 case '=': 848 if (*CurPtr == '=') { 849 ++CurPtr; 850 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 851 } 852 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 853 case '-': 854 if (*CurPtr == '>') { 855 ++CurPtr; 856 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); 857 } 858 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 859 case '|': 860 if (*CurPtr == '|') { 861 ++CurPtr; 862 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 863 } 864 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 865 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 866 case '&': 867 if (*CurPtr == '&') { 868 ++CurPtr; 869 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 870 } 871 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 872 case '!': 873 if (*CurPtr == '=') { 874 ++CurPtr; 875 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 876 } 877 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 878 case '%': 879 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) { 880 return LexDigit(); 881 } 882 883 if (MAI.hasMipsExpressions()) { 884 AsmToken::TokenKind Operator; 885 unsigned OperatorLength; 886 887 std::tie(Operator, OperatorLength) = 888 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>( 889 StringRef(CurPtr)) 890 .StartsWith("call16", {AsmToken::PercentCall16, 7}) 891 .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8}) 892 .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8}) 893 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10}) 894 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10}) 895 .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9}) 896 .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7}) 897 .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7}) 898 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9}) 899 .StartsWith("got_page", {AsmToken::PercentGot_Page, 9}) 900 .StartsWith("gottprel", {AsmToken::PercentGottprel, 9}) 901 .StartsWith("got", {AsmToken::PercentGot, 4}) 902 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7}) 903 .StartsWith("higher", {AsmToken::PercentHigher, 7}) 904 .StartsWith("highest", {AsmToken::PercentHighest, 8}) 905 .StartsWith("hi", {AsmToken::PercentHi, 3}) 906 .StartsWith("lo", {AsmToken::PercentLo, 3}) 907 .StartsWith("neg", {AsmToken::PercentNeg, 4}) 908 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9}) 909 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9}) 910 .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6}) 911 .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7}) 912 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9}) 913 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9}) 914 .Default({AsmToken::Percent, 1}); 915 916 if (Operator != AsmToken::Percent) { 917 CurPtr += OperatorLength - 1; 918 return AsmToken(Operator, StringRef(TokStart, OperatorLength)); 919 } 920 } 921 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 922 case '/': 923 IsAtStartOfStatement = OldIsAtStartOfStatement; 924 return LexSlash(); 925 case '\'': return LexSingleQuote(); 926 case '"': return LexQuote(); 927 case '0': case '1': case '2': case '3': case '4': 928 case '5': case '6': case '7': case '8': case '9': 929 return LexDigit(); 930 case '<': 931 switch (*CurPtr) { 932 case '<': 933 ++CurPtr; 934 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); 935 case '=': 936 ++CurPtr; 937 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); 938 case '>': 939 ++CurPtr; 940 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); 941 default: 942 return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 943 } 944 case '>': 945 switch (*CurPtr) { 946 case '>': 947 ++CurPtr; 948 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); 949 case '=': 950 ++CurPtr; 951 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); 952 default: 953 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 954 } 955 956 // TODO: Quoted identifiers (objc methods etc) 957 // local labels: [0-9][:] 958 // Forward/backward labels: [0-9][fb] 959 // Integers, fp constants, character constants. 960 } 961 } 962