1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class implements the lexer for assembly files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/MC/MCParser/AsmLexer.h" 14 #include "llvm/ADT/APInt.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/MC/MCAsmInfo.h" 19 #include "llvm/MC/MCParser/AsmLexer.h" 20 #include "llvm/Support/Compiler.h" 21 #include "llvm/Support/SMLoc.h" 22 #include "llvm/Support/SaveAndRestore.h" 23 #include "llvm/Support/raw_ostream.h" 24 #include <cassert> 25 #include <cctype> 26 #include <cstdio> 27 #include <cstring> 28 #include <string> 29 30 using namespace llvm; 31 32 SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Str.data()); } 33 34 SMLoc AsmToken::getEndLoc() const { 35 return SMLoc::getFromPointer(Str.data() + Str.size()); 36 } 37 38 SMRange AsmToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); } 39 40 void AsmToken::dump(raw_ostream &OS) const { 41 switch (Kind) { 42 case AsmToken::Error: 43 OS << "error"; 44 break; 45 case AsmToken::Identifier: 46 OS << "identifier: " << getString(); 47 break; 48 case AsmToken::Integer: 49 OS << "int: " << getString(); 50 break; 51 case AsmToken::Real: 52 OS << "real: " << getString(); 53 break; 54 case AsmToken::String: 55 OS << "string: " << getString(); 56 break; 57 58 // clang-format off 59 case AsmToken::Amp: OS << "Amp"; break; 60 case AsmToken::AmpAmp: OS << "AmpAmp"; break; 61 case AsmToken::At: OS << "At"; break; 62 case AsmToken::BackSlash: OS << "BackSlash"; break; 63 case AsmToken::BigNum: OS << "BigNum"; break; 64 case AsmToken::Caret: OS << "Caret"; break; 65 case AsmToken::Colon: OS << "Colon"; break; 66 case AsmToken::Comma: OS << "Comma"; break; 67 case AsmToken::Comment: OS << "Comment"; break; 68 case AsmToken::Dollar: OS << "Dollar"; break; 69 case AsmToken::Dot: OS << "Dot"; break; 70 case AsmToken::EndOfStatement: OS << "EndOfStatement"; break; 71 case AsmToken::Eof: OS << "Eof"; break; 72 case AsmToken::Equal: OS << "Equal"; break; 73 case AsmToken::EqualEqual: OS << "EqualEqual"; break; 74 case AsmToken::Exclaim: OS << "Exclaim"; break; 75 case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break; 76 case AsmToken::Greater: OS << "Greater"; break; 77 case AsmToken::GreaterEqual: OS << "GreaterEqual"; break; 78 case AsmToken::GreaterGreater: OS << "GreaterGreater"; break; 79 case AsmToken::Hash: OS << "Hash"; break; 80 case AsmToken::HashDirective: OS << "HashDirective"; break; 81 case AsmToken::LBrac: OS << "LBrac"; break; 82 case AsmToken::LCurly: OS << "LCurly"; break; 83 case AsmToken::LParen: OS << "LParen"; break; 84 case AsmToken::Less: OS << "Less"; break; 85 case AsmToken::LessEqual: OS << "LessEqual"; break; 86 case AsmToken::LessGreater: OS << "LessGreater"; break; 87 case AsmToken::LessLess: OS << "LessLess"; break; 88 case AsmToken::Minus: OS << "Minus"; break; 89 case AsmToken::MinusGreater: OS << "MinusGreater"; break; 90 case AsmToken::Percent: OS << "Percent"; break; 91 case AsmToken::Pipe: OS << "Pipe"; break; 92 case AsmToken::PipePipe: OS << "PipePipe"; break; 93 case AsmToken::Plus: OS << "Plus"; break; 94 case AsmToken::Question: OS << "Question"; break; 95 case AsmToken::RBrac: OS << "RBrac"; break; 96 case AsmToken::RCurly: OS << "RCurly"; break; 97 case AsmToken::RParen: OS << "RParen"; break; 98 case AsmToken::Slash: OS << "Slash"; break; 99 case AsmToken::Space: OS << "Space"; break; 100 case AsmToken::Star: OS << "Star"; break; 101 case AsmToken::Tilde: OS << "Tilde"; break; 102 // clang-format on 103 } 104 105 // Print the token string. 106 OS << " (\""; 107 OS.write_escaped(getString()); 108 OS << "\")"; 109 } 110 111 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { 112 // For COFF targets, this is true, while for ELF targets, it should be false. 113 // Currently, @specifier parsing depends on '@' being included in the token. 114 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@") && 115 MAI.useAtForSpecifier(); 116 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); 117 118 CurTok.emplace_back(AsmToken::Space, StringRef()); 119 } 120 121 void AsmLexer::setBuffer(StringRef Buf, const char *ptr, 122 bool EndStatementAtEOF) { 123 CurBuf = Buf; 124 125 if (ptr) 126 CurPtr = ptr; 127 else 128 CurPtr = CurBuf.begin(); 129 130 TokStart = nullptr; 131 this->EndStatementAtEOF = EndStatementAtEOF; 132 } 133 134 /// ReturnError - Set the error to the specified string at the specified 135 /// location. This is defined to always return AsmToken::Error. 136 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 137 SetError(SMLoc::getFromPointer(Loc), Msg); 138 139 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc)); 140 } 141 142 int AsmLexer::getNextChar() { 143 if (CurPtr == CurBuf.end()) 144 return EOF; 145 return (unsigned char)*CurPtr++; 146 } 147 148 int AsmLexer::peekNextChar() { 149 if (CurPtr == CurBuf.end()) 150 return EOF; 151 return (unsigned char)*CurPtr; 152 } 153 154 /// The leading integral digit sequence and dot should have already been 155 /// consumed, some or all of the fractional digit sequence *can* have been 156 /// consumed. 157 AsmToken AsmLexer::LexFloatLiteral() { 158 // Skip the fractional digit sequence. 159 while (isDigit(*CurPtr)) 160 ++CurPtr; 161 162 if (*CurPtr == '-' || *CurPtr == '+') 163 return ReturnError(CurPtr, "invalid sign in float literal"); 164 165 // Check for exponent 166 if ((*CurPtr == 'e' || *CurPtr == 'E')) { 167 ++CurPtr; 168 169 if (*CurPtr == '-' || *CurPtr == '+') 170 ++CurPtr; 171 172 while (isDigit(*CurPtr)) 173 ++CurPtr; 174 } 175 176 return AsmToken(AsmToken::Real, 177 StringRef(TokStart, CurPtr - TokStart)); 178 } 179 180 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ 181 /// while making sure there are enough actual digits around for the constant to 182 /// be valid. 183 /// 184 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed 185 /// before we get here. 186 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { 187 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && 188 "unexpected parse state in floating hex"); 189 bool NoFracDigits = true; 190 191 // Skip the fractional part if there is one 192 if (*CurPtr == '.') { 193 ++CurPtr; 194 195 const char *FracStart = CurPtr; 196 while (isHexDigit(*CurPtr)) 197 ++CurPtr; 198 199 NoFracDigits = CurPtr == FracStart; 200 } 201 202 if (NoIntDigits && NoFracDigits) 203 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 204 "expected at least one significand digit"); 205 206 // Make sure we do have some kind of proper exponent part 207 if (*CurPtr != 'p' && *CurPtr != 'P') 208 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 209 "expected exponent part 'p'"); 210 ++CurPtr; 211 212 if (*CurPtr == '+' || *CurPtr == '-') 213 ++CurPtr; 214 215 // N.b. exponent digits are *not* hex 216 const char *ExpStart = CurPtr; 217 while (isDigit(*CurPtr)) 218 ++CurPtr; 219 220 if (CurPtr == ExpStart) 221 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 222 "expected at least one exponent digit"); 223 224 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 225 } 226 227 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]* 228 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) { 229 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' || 230 (AllowAt && C == '@') || (AllowHash && C == '#'); 231 } 232 233 AsmToken AsmLexer::LexIdentifier() { 234 // Check for floating point literals. 235 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) { 236 // Disambiguate a .1243foo identifier from a floating literal. 237 while (isDigit(*CurPtr)) 238 ++CurPtr; 239 240 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier, 241 AllowHashInIdentifier) || 242 *CurPtr == 'e' || *CurPtr == 'E') 243 return LexFloatLiteral(); 244 } 245 246 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier)) 247 ++CurPtr; 248 249 // Handle . as a special case. 250 if (CurPtr == TokStart+1 && TokStart[0] == '.') 251 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 252 253 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 254 } 255 256 /// LexSlash: Slash: / 257 /// C-Style Comment: /* ... */ 258 /// C-style Comment: // ... 259 AsmToken AsmLexer::LexSlash() { 260 if (!MAI.shouldAllowAdditionalComments()) { 261 IsAtStartOfStatement = false; 262 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 263 } 264 265 switch (*CurPtr) { 266 case '*': 267 IsAtStartOfStatement = false; 268 break; // C style comment. 269 case '/': 270 ++CurPtr; 271 return LexLineComment(); 272 default: 273 IsAtStartOfStatement = false; 274 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); 275 } 276 277 // C Style comment. 278 ++CurPtr; // skip the star. 279 const char *CommentTextStart = CurPtr; 280 while (CurPtr != CurBuf.end()) { 281 switch (*CurPtr++) { 282 case '*': 283 // End of the comment? 284 if (*CurPtr != '/') 285 break; 286 // If we have a CommentConsumer, notify it about the comment. 287 if (CommentConsumer) { 288 CommentConsumer->HandleComment( 289 SMLoc::getFromPointer(CommentTextStart), 290 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); 291 } 292 ++CurPtr; // End the */. 293 return AsmToken(AsmToken::Comment, 294 StringRef(TokStart, CurPtr - TokStart)); 295 } 296 } 297 return ReturnError(TokStart, "unterminated comment"); 298 } 299 300 /// LexLineComment: Comment: #[^\n]* 301 /// : //[^\n]* 302 AsmToken AsmLexer::LexLineComment() { 303 // Mark This as an end of statement with a body of the 304 // comment. While it would be nicer to leave this two tokens, 305 // backwards compatability with TargetParsers makes keeping this in this form 306 // better. 307 const char *CommentTextStart = CurPtr; 308 int CurChar = getNextChar(); 309 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 310 CurChar = getNextChar(); 311 const char *NewlinePtr = CurPtr; 312 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') 313 ++CurPtr; 314 315 // If we have a CommentConsumer, notify it about the comment. 316 if (CommentConsumer) { 317 CommentConsumer->HandleComment( 318 SMLoc::getFromPointer(CommentTextStart), 319 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart)); 320 } 321 322 IsAtStartOfLine = true; 323 // This is a whole line comment. leave newline 324 if (IsAtStartOfStatement) 325 return AsmToken(AsmToken::EndOfStatement, 326 StringRef(TokStart, CurPtr - TokStart)); 327 IsAtStartOfStatement = true; 328 329 return AsmToken(AsmToken::EndOfStatement, 330 StringRef(TokStart, CurPtr - 1 - TokStart)); 331 } 332 333 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 334 // Skip case-insensitive ULL, UL, U, L and LL suffixes. 335 if (CurPtr[0] == 'U' || CurPtr[0] == 'u') 336 ++CurPtr; 337 if (CurPtr[0] == 'L' || CurPtr[0] == 'l') 338 ++CurPtr; 339 if (CurPtr[0] == 'L' || CurPtr[0] == 'l') 340 ++CurPtr; 341 } 342 343 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the 344 // integer as a hexadecimal, possibly with leading zeroes. 345 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, 346 bool LexHex) { 347 const char *FirstNonDec = nullptr; 348 const char *LookAhead = CurPtr; 349 while (true) { 350 if (isDigit(*LookAhead)) { 351 ++LookAhead; 352 } else { 353 if (!FirstNonDec) 354 FirstNonDec = LookAhead; 355 356 // Keep going if we are looking for a 'h' suffix. 357 if (LexHex && isHexDigit(*LookAhead)) 358 ++LookAhead; 359 else 360 break; 361 } 362 } 363 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H'); 364 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec; 365 if (isHex) 366 return 16; 367 return DefaultRadix; 368 } 369 370 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) { 371 while (hexDigitValue(*CurPtr) < DefaultRadix) { 372 ++CurPtr; 373 } 374 return CurPtr; 375 } 376 377 static AsmToken intToken(StringRef Ref, APInt &Value) { 378 if (Value.isIntN(64)) 379 return AsmToken(AsmToken::Integer, Ref, Value); 380 return AsmToken(AsmToken::BigNum, Ref, Value); 381 } 382 383 static std::string radixName(unsigned Radix) { 384 switch (Radix) { 385 case 2: 386 return "binary"; 387 case 8: 388 return "octal"; 389 case 10: 390 return "decimal"; 391 case 16: 392 return "hexadecimal"; 393 default: 394 return "base-" + std::to_string(Radix); 395 } 396 } 397 398 /// LexDigit: First character is [0-9]. 399 /// Local Label: [0-9][:] 400 /// Forward/Backward Label: [0-9][fb] 401 /// Binary integer: 0b[01]+ 402 /// Octal integer: 0[0-7]+ 403 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 404 /// Decimal integer: [1-9][0-9]* 405 AsmToken AsmLexer::LexDigit() { 406 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY]) 407 // MASM-flavor octal integer: [0-7]+[oOqQ] 408 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT]) 409 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] 410 if (LexMasmIntegers && isdigit(CurPtr[-1])) { 411 const char *FirstNonBinary = 412 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr; 413 const char *FirstNonDecimal = 414 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr; 415 const char *OldCurPtr = CurPtr; 416 while (isHexDigit(*CurPtr)) { 417 switch (*CurPtr) { 418 default: 419 if (!FirstNonDecimal) { 420 FirstNonDecimal = CurPtr; 421 } 422 [[fallthrough]]; 423 case '9': 424 case '8': 425 case '7': 426 case '6': 427 case '5': 428 case '4': 429 case '3': 430 case '2': 431 if (!FirstNonBinary) { 432 FirstNonBinary = CurPtr; 433 } 434 break; 435 case '1': 436 case '0': 437 break; 438 } 439 ++CurPtr; 440 } 441 if (*CurPtr == '.') { 442 // MASM float literals (other than hex floats) always contain a ".", and 443 // are always written in decimal. 444 ++CurPtr; 445 return LexFloatLiteral(); 446 } 447 448 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) { 449 ++CurPtr; 450 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 451 } 452 453 unsigned Radix = 0; 454 if (*CurPtr == 'h' || *CurPtr == 'H') { 455 // hexadecimal number 456 ++CurPtr; 457 Radix = 16; 458 } else if (*CurPtr == 't' || *CurPtr == 'T') { 459 // decimal number 460 ++CurPtr; 461 Radix = 10; 462 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' || 463 *CurPtr == 'Q') { 464 // octal number 465 ++CurPtr; 466 Radix = 8; 467 } else if (*CurPtr == 'y' || *CurPtr == 'Y') { 468 // binary number 469 ++CurPtr; 470 Radix = 2; 471 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr && 472 DefaultRadix < 14 && 473 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) { 474 Radix = 10; 475 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr && 476 DefaultRadix < 12 && 477 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) { 478 Radix = 2; 479 } 480 481 if (Radix) { 482 StringRef Result(TokStart, CurPtr - TokStart); 483 APInt Value(128, 0, true); 484 485 if (Result.drop_back().getAsInteger(Radix, Value)) 486 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 487 488 // MSVC accepts and ignores type suffices on integer literals. 489 SkipIgnoredIntegerSuffix(CurPtr); 490 491 return intToken(Result, Value); 492 } 493 494 // default-radix integers, or floating point numbers, fall through 495 CurPtr = OldCurPtr; 496 } 497 498 // MASM default-radix integers: [0-9a-fA-F]+ 499 // (All other integer literals have a radix specifier.) 500 if (LexMasmIntegers && UseMasmDefaultRadix) { 501 CurPtr = findLastDigit(CurPtr, 16); 502 StringRef Result(TokStart, CurPtr - TokStart); 503 504 APInt Value(128, 0, true); 505 if (Result.getAsInteger(DefaultRadix, Value)) { 506 return ReturnError(TokStart, 507 "invalid " + radixName(DefaultRadix) + " number"); 508 } 509 510 return intToken(Result, Value); 511 } 512 513 // Motorola hex integers: $[0-9a-fA-F]+ 514 if (LexMotorolaIntegers && CurPtr[-1] == '$') { 515 const char *NumStart = CurPtr; 516 while (isHexDigit(CurPtr[0])) 517 ++CurPtr; 518 519 APInt Result(128, 0); 520 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result)) 521 return ReturnError(TokStart, "invalid hexadecimal number"); 522 523 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 524 } 525 526 // Motorola binary integers: %[01]+ 527 if (LexMotorolaIntegers && CurPtr[-1] == '%') { 528 const char *NumStart = CurPtr; 529 while (*CurPtr == '0' || *CurPtr == '1') 530 ++CurPtr; 531 532 APInt Result(128, 0); 533 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result)) 534 return ReturnError(TokStart, "invalid binary number"); 535 536 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 537 } 538 539 // Decimal integer: [1-9][0-9]* 540 // HLASM-flavour decimal integer: [0-9][0-9]* 541 // FIXME: Later on, support for fb for HLASM has to be added in 542 // as they probably would be needed for asm goto 543 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') { 544 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers); 545 546 if (!LexHLASMIntegers) { 547 bool IsHex = Radix == 16; 548 // Check for floating point literals. 549 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { 550 if (*CurPtr == '.') 551 ++CurPtr; 552 return LexFloatLiteral(); 553 } 554 } 555 556 StringRef Result(TokStart, CurPtr - TokStart); 557 558 APInt Value(128, 0, true); 559 if (Result.getAsInteger(Radix, Value)) 560 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 561 562 if (!LexHLASMIntegers) 563 // The darwin/x86 (and x86-64) assembler accepts and ignores type 564 // suffices on integer literals. 565 SkipIgnoredIntegerSuffix(CurPtr); 566 567 return intToken(Result, Value); 568 } 569 570 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { 571 ++CurPtr; 572 // See if we actually have "0b" as part of something like "jmp 0b\n" 573 if (!isDigit(CurPtr[0])) { 574 --CurPtr; 575 StringRef Result(TokStart, CurPtr - TokStart); 576 return AsmToken(AsmToken::Integer, Result, 0); 577 } 578 const char *NumStart = CurPtr; 579 while (CurPtr[0] == '0' || CurPtr[0] == '1') 580 ++CurPtr; 581 582 // Requires at least one binary digit. 583 if (CurPtr == NumStart) 584 return ReturnError(TokStart, "invalid binary number"); 585 586 StringRef Result(TokStart, CurPtr - TokStart); 587 588 APInt Value(128, 0, true); 589 if (Result.substr(2).getAsInteger(2, Value)) 590 return ReturnError(TokStart, "invalid binary number"); 591 592 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 593 // suffixes on integer literals. 594 SkipIgnoredIntegerSuffix(CurPtr); 595 596 return intToken(Result, Value); 597 } 598 599 if ((*CurPtr == 'x') || (*CurPtr == 'X')) { 600 ++CurPtr; 601 const char *NumStart = CurPtr; 602 while (isHexDigit(CurPtr[0])) 603 ++CurPtr; 604 605 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be 606 // diagnosed by LexHexFloatLiteral). 607 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') 608 return LexHexFloatLiteral(NumStart == CurPtr); 609 610 // Otherwise requires at least one hex digit. 611 if (CurPtr == NumStart) 612 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 613 614 APInt Result(128, 0); 615 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 616 return ReturnError(TokStart, "invalid hexadecimal number"); 617 618 // Consume the optional [hH]. 619 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) 620 ++CurPtr; 621 622 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 623 // suffixes on integer literals. 624 SkipIgnoredIntegerSuffix(CurPtr); 625 626 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 627 } 628 629 // Either octal or hexadecimal. 630 APInt Value(128, 0, true); 631 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers); 632 StringRef Result(TokStart, CurPtr - TokStart); 633 if (Result.getAsInteger(Radix, Value)) 634 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); 635 636 // Consume the [hH]. 637 if (Radix == 16) 638 ++CurPtr; 639 640 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 641 // suffixes on integer literals. 642 SkipIgnoredIntegerSuffix(CurPtr); 643 644 return intToken(Result, Value); 645 } 646 647 /// LexSingleQuote: Integer: 'b' 648 AsmToken AsmLexer::LexSingleQuote() { 649 int CurChar = getNextChar(); 650 651 if (LexHLASMStrings) 652 return ReturnError(TokStart, "invalid usage of character literals"); 653 654 if (LexMasmStrings) { 655 while (CurChar != EOF) { 656 if (CurChar != '\'') { 657 CurChar = getNextChar(); 658 } else if (peekNextChar() == '\'') { 659 // In MASM single-quote strings, doubled single-quotes mean an escaped 660 // single quote, so should be lexed in. 661 (void)getNextChar(); 662 CurChar = getNextChar(); 663 } else { 664 break; 665 } 666 } 667 if (CurChar == EOF) 668 return ReturnError(TokStart, "unterminated string constant"); 669 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 670 } 671 672 if (CurChar == '\\') 673 CurChar = getNextChar(); 674 675 if (CurChar == EOF) 676 return ReturnError(TokStart, "unterminated single quote"); 677 678 CurChar = getNextChar(); 679 680 if (CurChar != '\'') 681 return ReturnError(TokStart, "single quote way too long"); 682 683 // The idea here being that 'c' is basically just an integral 684 // constant. 685 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 686 long long Value; 687 688 if (Res.starts_with("\'\\")) { 689 char theChar = Res[2]; 690 switch (theChar) { 691 default: Value = theChar; break; 692 case '\'': Value = '\''; break; 693 case 't': Value = '\t'; break; 694 case 'n': Value = '\n'; break; 695 case 'b': Value = '\b'; break; 696 case 'f': Value = '\f'; break; 697 case 'r': Value = '\r'; break; 698 } 699 } else 700 Value = TokStart[1]; 701 702 return AsmToken(AsmToken::Integer, Res, Value); 703 } 704 705 /// LexQuote: String: "..." 706 AsmToken AsmLexer::LexQuote() { 707 int CurChar = getNextChar(); 708 if (LexHLASMStrings) 709 return ReturnError(TokStart, "invalid usage of string literals"); 710 711 if (LexMasmStrings) { 712 while (CurChar != EOF) { 713 if (CurChar != '"') { 714 CurChar = getNextChar(); 715 } else if (peekNextChar() == '"') { 716 // In MASM double-quoted strings, doubled double-quotes mean an escaped 717 // double quote, so should be lexed in. 718 (void)getNextChar(); 719 CurChar = getNextChar(); 720 } else { 721 break; 722 } 723 } 724 if (CurChar == EOF) 725 return ReturnError(TokStart, "unterminated string constant"); 726 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 727 } 728 729 while (CurChar != '"') { 730 if (CurChar == '\\') { 731 // Allow \", etc. 732 CurChar = getNextChar(); 733 } 734 735 if (CurChar == EOF) 736 return ReturnError(TokStart, "unterminated string constant"); 737 738 CurChar = getNextChar(); 739 } 740 741 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 742 } 743 744 StringRef AsmLexer::LexUntilEndOfStatement() { 745 TokStart = CurPtr; 746 747 while (!isAtStartOfComment(CurPtr) && // Start of line comment. 748 !isAtStatementSeparator(CurPtr) && // End of statement marker. 749 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 750 ++CurPtr; 751 } 752 return StringRef(TokStart, CurPtr-TokStart); 753 } 754 755 StringRef AsmLexer::LexUntilEndOfLine() { 756 TokStart = CurPtr; 757 758 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { 759 ++CurPtr; 760 } 761 return StringRef(TokStart, CurPtr-TokStart); 762 } 763 764 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, 765 bool ShouldSkipSpace) { 766 SaveAndRestore SavedTokenStart(TokStart); 767 SaveAndRestore SavedCurPtr(CurPtr); 768 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine); 769 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement); 770 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace); 771 SaveAndRestore SavedIsPeeking(IsPeeking, true); 772 std::string SavedErr = getErr(); 773 SMLoc SavedErrLoc = getErrLoc(); 774 775 size_t ReadCount; 776 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { 777 AsmToken Token = LexToken(); 778 779 Buf[ReadCount] = Token; 780 781 if (Token.is(AsmToken::Eof)) { 782 ReadCount++; 783 break; 784 } 785 } 786 787 SetError(SavedErrLoc, SavedErr); 788 return ReadCount; 789 } 790 791 bool AsmLexer::isAtStartOfComment(const char *Ptr) { 792 if (MAI.isHLASM() && !IsAtStartOfStatement) 793 return false; 794 795 StringRef CommentString = MAI.getCommentString(); 796 797 if (CommentString.size() == 1) 798 return CommentString[0] == Ptr[0]; 799 800 // Allow # preprocessor comments also be counted as comments for "##" cases 801 if (CommentString[1] == '#') 802 return CommentString[0] == Ptr[0]; 803 804 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0; 805 } 806 807 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 808 return strncmp(Ptr, MAI.getSeparatorString(), 809 strlen(MAI.getSeparatorString())) == 0; 810 } 811 812 AsmToken AsmLexer::LexToken() { 813 TokStart = CurPtr; 814 // This always consumes at least one character. 815 int CurChar = getNextChar(); 816 817 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) { 818 // If this starts with a '#', this may be a cpp 819 // hash directive and otherwise a line comment. 820 AsmToken TokenBuf[2]; 821 MutableArrayRef<AsmToken> Buf(TokenBuf, 2); 822 size_t num = peekTokens(Buf, true); 823 // There cannot be a space preceding this 824 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) && 825 TokenBuf[1].is(AsmToken::String)) { 826 CurPtr = TokStart; // reset curPtr; 827 StringRef s = LexUntilEndOfLine(); 828 UnLex(TokenBuf[1]); 829 UnLex(TokenBuf[0]); 830 return AsmToken(AsmToken::HashDirective, s); 831 } 832 833 if (MAI.shouldAllowAdditionalComments()) 834 return LexLineComment(); 835 } 836 837 if (isAtStartOfComment(TokStart)) { 838 CurPtr += MAI.getCommentString().size() - 1; 839 return LexLineComment(); 840 } 841 842 if (isAtStatementSeparator(TokStart)) { 843 CurPtr += strlen(MAI.getSeparatorString()) - 1; 844 IsAtStartOfLine = true; 845 IsAtStartOfStatement = true; 846 return AsmToken(AsmToken::EndOfStatement, 847 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 848 } 849 850 // If we're missing a newline at EOF, make sure we still get an 851 // EndOfStatement token before the Eof token. 852 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) { 853 IsAtStartOfLine = true; 854 IsAtStartOfStatement = true; 855 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); 856 } 857 IsAtStartOfLine = false; 858 bool OldIsAtStartOfStatement = IsAtStartOfStatement; 859 IsAtStartOfStatement = false; 860 switch (CurChar) { 861 default: 862 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]* 863 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of 864 // an identifier is target-dependent. These characters are handled in the 865 // respective switch cases. 866 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 867 return LexIdentifier(); 868 869 // Unknown character, emit an error. 870 return ReturnError(TokStart, "invalid character in input"); 871 case EOF: 872 if (EndStatementAtEOF) { 873 IsAtStartOfLine = true; 874 IsAtStartOfStatement = true; 875 } 876 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 877 case 0: 878 case ' ': 879 case '\t': 880 IsAtStartOfStatement = OldIsAtStartOfStatement; 881 while (*CurPtr == ' ' || *CurPtr == '\t') 882 CurPtr++; 883 if (SkipSpace) 884 return LexToken(); // Ignore whitespace. 885 else 886 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart)); 887 case '\r': { 888 IsAtStartOfLine = true; 889 IsAtStartOfStatement = true; 890 // If this is a CR followed by LF, treat that as one token. 891 if (CurPtr != CurBuf.end() && *CurPtr == '\n') 892 ++CurPtr; 893 return AsmToken(AsmToken::EndOfStatement, 894 StringRef(TokStart, CurPtr - TokStart)); 895 } 896 case '\n': 897 IsAtStartOfLine = true; 898 IsAtStartOfStatement = true; 899 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 900 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 901 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 902 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 903 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 904 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 905 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 906 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 907 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 908 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 909 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 910 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 911 case '$': { 912 if (LexMotorolaIntegers && isHexDigit(*CurPtr)) 913 return LexDigit(); 914 if (MAI.doesAllowDollarAtStartOfIdentifier()) 915 return LexIdentifier(); 916 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 917 } 918 case '@': 919 if (MAI.doesAllowAtAtStartOfIdentifier()) 920 return LexIdentifier(); 921 return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 922 case '#': 923 if (MAI.isHLASM()) 924 return LexIdentifier(); 925 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 926 case '?': 927 if (MAI.doesAllowQuestionAtStartOfIdentifier()) 928 return LexIdentifier(); 929 return AsmToken(AsmToken::Question, StringRef(TokStart, 1)); 930 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 931 case '=': 932 if (*CurPtr == '=') { 933 ++CurPtr; 934 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 935 } 936 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 937 case '-': 938 if (*CurPtr == '>') { 939 ++CurPtr; 940 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); 941 } 942 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 943 case '|': 944 if (*CurPtr == '|') { 945 ++CurPtr; 946 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 947 } 948 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 949 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 950 case '&': 951 if (*CurPtr == '&') { 952 ++CurPtr; 953 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 954 } 955 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 956 case '!': 957 if (*CurPtr == '=') { 958 ++CurPtr; 959 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 960 } 961 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 962 case '%': 963 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) { 964 return LexDigit(); 965 } 966 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 967 case '/': 968 IsAtStartOfStatement = OldIsAtStartOfStatement; 969 return LexSlash(); 970 case '\'': return LexSingleQuote(); 971 case '"': return LexQuote(); 972 case '0': case '1': case '2': case '3': case '4': 973 case '5': case '6': case '7': case '8': case '9': 974 return LexDigit(); 975 case '<': 976 switch (*CurPtr) { 977 case '<': 978 ++CurPtr; 979 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); 980 case '=': 981 ++CurPtr; 982 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); 983 case '>': 984 ++CurPtr; 985 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); 986 default: 987 return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 988 } 989 case '>': 990 switch (*CurPtr) { 991 case '>': 992 ++CurPtr; 993 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); 994 case '=': 995 ++CurPtr; 996 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); 997 default: 998 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 999 } 1000 1001 // TODO: Quoted identifiers (objc methods etc) 1002 // local labels: [0-9][:] 1003 // Forward/backward labels: [0-9][fb] 1004 // Integers, fp constants, character constants. 1005 } 1006 } 1007