1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class implements the lexer for assembly files.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/MC/MCAsmInfo.h"
19 #include "llvm/MC/MCParser/AsmLexer.h"
20 #include "llvm/Support/Compiler.h"
21 #include "llvm/Support/SMLoc.h"
22 #include "llvm/Support/SaveAndRestore.h"
23 #include "llvm/Support/raw_ostream.h"
24 #include <cassert>
25 #include <cctype>
26 #include <cstdio>
27 #include <cstring>
28 #include <string>
29
30 using namespace llvm;
31
getLoc() const32 SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Str.data()); }
33
getEndLoc() const34 SMLoc AsmToken::getEndLoc() const {
35 return SMLoc::getFromPointer(Str.data() + Str.size());
36 }
37
getLocRange() const38 SMRange AsmToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); }
39
dump(raw_ostream & OS) const40 void AsmToken::dump(raw_ostream &OS) const {
41 switch (Kind) {
42 case AsmToken::Error:
43 OS << "error";
44 break;
45 case AsmToken::Identifier:
46 OS << "identifier: " << getString();
47 break;
48 case AsmToken::Integer:
49 OS << "int: " << getString();
50 break;
51 case AsmToken::Real:
52 OS << "real: " << getString();
53 break;
54 case AsmToken::String:
55 OS << "string: " << getString();
56 break;
57
58 // clang-format off
59 case AsmToken::Amp: OS << "Amp"; break;
60 case AsmToken::AmpAmp: OS << "AmpAmp"; break;
61 case AsmToken::At: OS << "At"; break;
62 case AsmToken::BackSlash: OS << "BackSlash"; break;
63 case AsmToken::BigNum: OS << "BigNum"; break;
64 case AsmToken::Caret: OS << "Caret"; break;
65 case AsmToken::Colon: OS << "Colon"; break;
66 case AsmToken::Comma: OS << "Comma"; break;
67 case AsmToken::Comment: OS << "Comment"; break;
68 case AsmToken::Dollar: OS << "Dollar"; break;
69 case AsmToken::Dot: OS << "Dot"; break;
70 case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
71 case AsmToken::Eof: OS << "Eof"; break;
72 case AsmToken::Equal: OS << "Equal"; break;
73 case AsmToken::EqualEqual: OS << "EqualEqual"; break;
74 case AsmToken::Exclaim: OS << "Exclaim"; break;
75 case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
76 case AsmToken::Greater: OS << "Greater"; break;
77 case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
78 case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
79 case AsmToken::Hash: OS << "Hash"; break;
80 case AsmToken::HashDirective: OS << "HashDirective"; break;
81 case AsmToken::LBrac: OS << "LBrac"; break;
82 case AsmToken::LCurly: OS << "LCurly"; break;
83 case AsmToken::LParen: OS << "LParen"; break;
84 case AsmToken::Less: OS << "Less"; break;
85 case AsmToken::LessEqual: OS << "LessEqual"; break;
86 case AsmToken::LessGreater: OS << "LessGreater"; break;
87 case AsmToken::LessLess: OS << "LessLess"; break;
88 case AsmToken::Minus: OS << "Minus"; break;
89 case AsmToken::MinusGreater: OS << "MinusGreater"; break;
90 case AsmToken::Percent: OS << "Percent"; break;
91 case AsmToken::Pipe: OS << "Pipe"; break;
92 case AsmToken::PipePipe: OS << "PipePipe"; break;
93 case AsmToken::Plus: OS << "Plus"; break;
94 case AsmToken::Question: OS << "Question"; break;
95 case AsmToken::RBrac: OS << "RBrac"; break;
96 case AsmToken::RCurly: OS << "RCurly"; break;
97 case AsmToken::RParen: OS << "RParen"; break;
98 case AsmToken::Slash: OS << "Slash"; break;
99 case AsmToken::Space: OS << "Space"; break;
100 case AsmToken::Star: OS << "Star"; break;
101 case AsmToken::Tilde: OS << "Tilde"; break;
102 // clang-format on
103 }
104
105 // Print the token string.
106 OS << " (\"";
107 OS.write_escaped(getString());
108 OS << "\")";
109 }
110
AsmLexer(const MCAsmInfo & MAI)111 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
112 // For COFF targets, this is true, while for ELF targets, it should be false.
113 // Currently, @specifier parsing depends on '@' being included in the token.
114 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@") &&
115 MAI.useAtForSpecifier();
116 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
117
118 CurTok.emplace_back(AsmToken::Space, StringRef());
119 }
120
setBuffer(StringRef Buf,const char * ptr,bool EndStatementAtEOF)121 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
122 bool EndStatementAtEOF) {
123 CurBuf = Buf;
124
125 if (ptr)
126 CurPtr = ptr;
127 else
128 CurPtr = CurBuf.begin();
129
130 TokStart = nullptr;
131 this->EndStatementAtEOF = EndStatementAtEOF;
132 }
133
134 /// ReturnError - Set the error to the specified string at the specified
135 /// location. This is defined to always return AsmToken::Error.
ReturnError(const char * Loc,const std::string & Msg)136 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
137 SetError(SMLoc::getFromPointer(Loc), Msg);
138
139 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
140 }
141
getNextChar()142 int AsmLexer::getNextChar() {
143 if (CurPtr == CurBuf.end())
144 return EOF;
145 return (unsigned char)*CurPtr++;
146 }
147
peekNextChar()148 int AsmLexer::peekNextChar() {
149 if (CurPtr == CurBuf.end())
150 return EOF;
151 return (unsigned char)*CurPtr;
152 }
153
154 /// The leading integral digit sequence and dot should have already been
155 /// consumed, some or all of the fractional digit sequence *can* have been
156 /// consumed.
LexFloatLiteral()157 AsmToken AsmLexer::LexFloatLiteral() {
158 // Skip the fractional digit sequence.
159 while (isDigit(*CurPtr))
160 ++CurPtr;
161
162 if (*CurPtr == '-' || *CurPtr == '+')
163 return ReturnError(CurPtr, "invalid sign in float literal");
164
165 // Check for exponent
166 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
167 ++CurPtr;
168
169 if (*CurPtr == '-' || *CurPtr == '+')
170 ++CurPtr;
171
172 while (isDigit(*CurPtr))
173 ++CurPtr;
174 }
175
176 return AsmToken(AsmToken::Real,
177 StringRef(TokStart, CurPtr - TokStart));
178 }
179
180 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
181 /// while making sure there are enough actual digits around for the constant to
182 /// be valid.
183 ///
184 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
185 /// before we get here.
LexHexFloatLiteral(bool NoIntDigits)186 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
187 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
188 "unexpected parse state in floating hex");
189 bool NoFracDigits = true;
190
191 // Skip the fractional part if there is one
192 if (*CurPtr == '.') {
193 ++CurPtr;
194
195 const char *FracStart = CurPtr;
196 while (isHexDigit(*CurPtr))
197 ++CurPtr;
198
199 NoFracDigits = CurPtr == FracStart;
200 }
201
202 if (NoIntDigits && NoFracDigits)
203 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
204 "expected at least one significand digit");
205
206 // Make sure we do have some kind of proper exponent part
207 if (*CurPtr != 'p' && *CurPtr != 'P')
208 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
209 "expected exponent part 'p'");
210 ++CurPtr;
211
212 if (*CurPtr == '+' || *CurPtr == '-')
213 ++CurPtr;
214
215 // N.b. exponent digits are *not* hex
216 const char *ExpStart = CurPtr;
217 while (isDigit(*CurPtr))
218 ++CurPtr;
219
220 if (CurPtr == ExpStart)
221 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
222 "expected at least one exponent digit");
223
224 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
225 }
226
227 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
isIdentifierChar(char C,bool AllowAt,bool AllowHash)228 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
229 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
230 (AllowAt && C == '@') || (AllowHash && C == '#');
231 }
232
LexIdentifier()233 AsmToken AsmLexer::LexIdentifier() {
234 // Check for floating point literals.
235 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
236 // Disambiguate a .1243foo identifier from a floating literal.
237 while (isDigit(*CurPtr))
238 ++CurPtr;
239
240 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
241 AllowHashInIdentifier) ||
242 *CurPtr == 'e' || *CurPtr == 'E')
243 return LexFloatLiteral();
244 }
245
246 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
247 ++CurPtr;
248
249 // Handle . as a special case.
250 if (CurPtr == TokStart+1 && TokStart[0] == '.')
251 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
252
253 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
254 }
255
256 /// LexSlash: Slash: /
257 /// C-Style Comment: /* ... */
258 /// C-style Comment: // ...
LexSlash()259 AsmToken AsmLexer::LexSlash() {
260 if (!MAI.shouldAllowAdditionalComments()) {
261 IsAtStartOfStatement = false;
262 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
263 }
264
265 switch (*CurPtr) {
266 case '*':
267 IsAtStartOfStatement = false;
268 break; // C style comment.
269 case '/':
270 ++CurPtr;
271 return LexLineComment();
272 default:
273 IsAtStartOfStatement = false;
274 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
275 }
276
277 // C Style comment.
278 ++CurPtr; // skip the star.
279 const char *CommentTextStart = CurPtr;
280 while (CurPtr != CurBuf.end()) {
281 switch (*CurPtr++) {
282 case '*':
283 // End of the comment?
284 if (*CurPtr != '/')
285 break;
286 // If we have a CommentConsumer, notify it about the comment.
287 if (CommentConsumer) {
288 CommentConsumer->HandleComment(
289 SMLoc::getFromPointer(CommentTextStart),
290 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
291 }
292 ++CurPtr; // End the */.
293 return AsmToken(AsmToken::Comment,
294 StringRef(TokStart, CurPtr - TokStart));
295 }
296 }
297 return ReturnError(TokStart, "unterminated comment");
298 }
299
300 /// LexLineComment: Comment: #[^\n]*
301 /// : //[^\n]*
LexLineComment()302 AsmToken AsmLexer::LexLineComment() {
303 // Mark This as an end of statement with a body of the
304 // comment. While it would be nicer to leave this two tokens,
305 // backwards compatability with TargetParsers makes keeping this in this form
306 // better.
307 const char *CommentTextStart = CurPtr;
308 int CurChar = getNextChar();
309 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
310 CurChar = getNextChar();
311 const char *NewlinePtr = CurPtr;
312 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
313 ++CurPtr;
314
315 // If we have a CommentConsumer, notify it about the comment.
316 if (CommentConsumer) {
317 CommentConsumer->HandleComment(
318 SMLoc::getFromPointer(CommentTextStart),
319 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
320 }
321
322 IsAtStartOfLine = true;
323 // This is a whole line comment. leave newline
324 if (IsAtStartOfStatement)
325 return AsmToken(AsmToken::EndOfStatement,
326 StringRef(TokStart, CurPtr - TokStart));
327 IsAtStartOfStatement = true;
328
329 return AsmToken(AsmToken::EndOfStatement,
330 StringRef(TokStart, CurPtr - 1 - TokStart));
331 }
332
/// Advances \p CurPtr past an optional integer type suffix: at most one 'U',
/// followed by at most two 'L's, each matched case-insensitively (so ULL, UL,
/// U, L and LL are all skipped).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const auto SkipIfEither = [&CurPtr](char Upper, char Lower) {
    if (*CurPtr == Upper || *CurPtr == Lower)
      ++CurPtr;
  };

  SkipIfEither('U', 'u');
  SkipIfEither('L', 'l');
  SkipIfEither('L', 'l');
}
342
343 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
344 // integer as a hexadecimal, possibly with leading zeroes.
doHexLookAhead(const char * & CurPtr,unsigned DefaultRadix,bool LexHex)345 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
346 bool LexHex) {
347 const char *FirstNonDec = nullptr;
348 const char *LookAhead = CurPtr;
349 while (true) {
350 if (isDigit(*LookAhead)) {
351 ++LookAhead;
352 } else {
353 if (!FirstNonDec)
354 FirstNonDec = LookAhead;
355
356 // Keep going if we are looking for a 'h' suffix.
357 if (LexHex && isHexDigit(*LookAhead))
358 ++LookAhead;
359 else
360 break;
361 }
362 }
363 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
364 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
365 if (isHex)
366 return 16;
367 return DefaultRadix;
368 }
369
findLastDigit(const char * CurPtr,unsigned DefaultRadix)370 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
371 while (hexDigitValue(*CurPtr) < DefaultRadix) {
372 ++CurPtr;
373 }
374 return CurPtr;
375 }
376
intToken(StringRef Ref,APInt & Value)377 static AsmToken intToken(StringRef Ref, APInt &Value) {
378 if (Value.isIntN(64))
379 return AsmToken(AsmToken::Integer, Ref, Value);
380 return AsmToken(AsmToken::BigNum, Ref, Value);
381 }
382
/// Returns a name for \p Radix for use in diagnostics: "binary", "octal",
/// "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-<N>" for any
/// other value.
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
397
/// LexDigit: First character is [0-9].
///   Local Label: [0-9][:]
///   Forward/Backward Label: [0-9][fb]
///   Binary integer: 0b[01]+
///   Octal integer: 0[0-7]+
///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
///   Decimal integer: [1-9][0-9]*
///
/// Lexes a numeric literal whose first character has already been consumed
/// (it is available as CurPtr[-1] / TokStart[0]). LexToken() also routes the
/// Motorola prefixes '$' and '%' here. Which syntaxes are accepted depends on
/// the LexMasmIntegers, UseMasmDefaultRadix, LexMotorolaIntegers and
/// LexHLASMIntegers settings.
///
/// \returns an Integer or BigNum token (chosen by intToken()), a Real token
///          for floating-point forms, or an Error token when the digits do not
///          parse in the selected radix.
AsmToken AsmLexer::LexDigit() {
  // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
  // MASM-flavor octal integer: [0-7]+[oOqQ]
  // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
  // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
  if (LexMasmIntegers && isdigit(CurPtr[-1])) {
    // Track where the literal first stops being valid binary / decimal so the
    // trailing 'b' / 'd' radix markers (which are also hex digits) can be
    // recognised below.
    const char *FirstNonBinary =
        (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
    const char *FirstNonDecimal =
        (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
    // Saved so the scan can be undone if no MASM radix marker is found.
    const char *OldCurPtr = CurPtr;
    while (isHexDigit(*CurPtr)) {
      switch (*CurPtr) {
      default:
        // A hex letter: neither decimal nor binary.
        if (!FirstNonDecimal) {
          FirstNonDecimal = CurPtr;
        }
        [[fallthrough]];
      case '9':
      case '8':
      case '7':
      case '6':
      case '5':
      case '4':
      case '3':
      case '2':
        if (!FirstNonBinary) {
          FirstNonBinary = CurPtr;
        }
        break;
      case '1':
      case '0':
        break;
      }
      ++CurPtr;
    }
    if (*CurPtr == '.') {
      // MASM float literals (other than hex floats) always contain a ".", and
      // are always written in decimal.
      ++CurPtr;
      return LexFloatLiteral();
    }

    // Hex digits followed by [rR] are returned as a Real token when
    // LexMasmHexFloats is set.
    if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
      ++CurPtr;
      return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
    }

    // Radix stays 0 when no radix marker is recognised.
    unsigned Radix = 0;
    if (*CurPtr == 'h' || *CurPtr == 'H') {
      // hexadecimal number
      ++CurPtr;
      Radix = 16;
    } else if (*CurPtr == 't' || *CurPtr == 'T') {
      // decimal number
      ++CurPtr;
      Radix = 10;
    } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
               *CurPtr == 'Q') {
      // octal number
      ++CurPtr;
      Radix = 8;
    } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
      // binary number
      ++CurPtr;
      Radix = 2;
    } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
               DefaultRadix < 14 &&
               (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
      // The only non-decimal character is a final 'd', already consumed by the
      // hex-digit scan above; it acts as a decimal marker.
      Radix = 10;
    } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
               DefaultRadix < 12 &&
               (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
      // Likewise, a final 'b' after only binary digits acts as a binary marker.
      Radix = 2;
    }

    if (Radix) {
      StringRef Result(TokStart, CurPtr - TokStart);
      APInt Value(128, 0, true);

      // In every branch above the radix marker is the last character of
      // Result, so drop it before parsing the digits.
      if (Result.drop_back().getAsInteger(Radix, Value))
        return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

      // MSVC accepts and ignores type suffices on integer literals.
      SkipIgnoredIntegerSuffix(CurPtr);

      return intToken(Result, Value);
    }

    // default-radix integers, or floating point numbers, fall through
    CurPtr = OldCurPtr;
  }

  // MASM default-radix integers: [0-9a-fA-F]+
  // (All other integer literals have a radix specifier.)
  if (LexMasmIntegers && UseMasmDefaultRadix) {
    // Take every hex digit, then let getAsInteger() reject any that are not
    // valid in DefaultRadix.
    CurPtr = findLastDigit(CurPtr, 16);
    StringRef Result(TokStart, CurPtr - TokStart);

    APInt Value(128, 0, true);
    if (Result.getAsInteger(DefaultRadix, Value)) {
      return ReturnError(TokStart,
                         "invalid " + radixName(DefaultRadix) + " number");
    }

    return intToken(Result, Value);
  }

  // Motorola hex integers: $[0-9a-fA-F]+
  if (LexMotorolaIntegers && CurPtr[-1] == '$') {
    const char *NumStart = CurPtr;
    while (isHexDigit(CurPtr[0]))
      ++CurPtr;

    // Parse the digits only; the token text still includes the '$' prefix.
    APInt Result(128, 0);
    if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
      return ReturnError(TokStart, "invalid hexadecimal number");

    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Motorola binary integers: %[01]+
  if (LexMotorolaIntegers && CurPtr[-1] == '%') {
    const char *NumStart = CurPtr;
    while (*CurPtr == '0' || *CurPtr == '1')
      ++CurPtr;

    // Parse the digits only; the token text still includes the '%' prefix.
    APInt Result(128, 0);
    if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
      return ReturnError(TokStart, "invalid binary number");

    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Decimal integer: [1-9][0-9]*
  // HLASM-flavour decimal integer: [0-9][0-9]*
  // FIXME: Later on, support for fb for HLASM has to be added in
  // as they probably would be needed for asm goto
  //
  // Also taken for "0." so that a float with a zero integer part is handled
  // by the floating-point check below.
  if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
    unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);

    if (!LexHLASMIntegers) {
      bool IsHex = Radix == 16;
      // Check for floating point literals.
      if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
        if (*CurPtr == '.')
          ++CurPtr;
        return LexFloatLiteral();
      }
    }

    StringRef Result(TokStart, CurPtr - TokStart);

    APInt Value(128, 0, true);
    if (Result.getAsInteger(Radix, Value))
      return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

    if (!LexHLASMIntegers)
      // The darwin/x86 (and x86-64) assembler accepts and ignores type
      // suffices on integer literals.
      SkipIgnoredIntegerSuffix(CurPtr);

    return intToken(Result, Value);
  }

  // From here on the first character is '0'.
  if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
    ++CurPtr;
    // See if we actually have "0b" as part of something like "jmp 0b\n"
    if (!isDigit(CurPtr[0])) {
      // Give the 'b' back and return just the "0" as an integer.
      --CurPtr;
      StringRef Result(TokStart, CurPtr - TokStart);
      return AsmToken(AsmToken::Integer, Result, 0);
    }
    const char *NumStart = CurPtr;
    while (CurPtr[0] == '0' || CurPtr[0] == '1')
      ++CurPtr;

    // Requires at least one binary digit.
    if (CurPtr == NumStart)
      return ReturnError(TokStart, "invalid binary number");

    StringRef Result(TokStart, CurPtr - TokStart);

    // Skip the two-character "0b" prefix before parsing.
    APInt Value(128, 0, true);
    if (Result.substr(2).getAsInteger(2, Value))
      return ReturnError(TokStart, "invalid binary number");

    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
    // suffixes on integer literals.
    SkipIgnoredIntegerSuffix(CurPtr);

    return intToken(Result, Value);
  }

  if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
    ++CurPtr;
    const char *NumStart = CurPtr;
    while (isHexDigit(CurPtr[0]))
      ++CurPtr;

    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
    // diagnosed by LexHexFloatLiteral).
    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
      return LexHexFloatLiteral(NumStart == CurPtr);

    // Otherwise requires at least one hex digit.
    if (CurPtr == NumStart)
      return ReturnError(CurPtr-2, "invalid hexadecimal number");

    // The parsed text starts at TokStart and so includes the "0x" prefix.
    // NOTE(review): radix 0 presumably asks getAsInteger() to derive the radix
    // from that prefix -- confirm against the StringRef documentation.
    APInt Result(128, 0);
    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
      return ReturnError(TokStart, "invalid hexadecimal number");

    // Consume the optional [hH].
    if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
      ++CurPtr;

    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
    // suffixes on integer literals.
    SkipIgnoredIntegerSuffix(CurPtr);

    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Either octal or hexadecimal.
  APInt Value(128, 0, true);
  unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
  // Result is taken before the [hH] suffix is consumed, so it excludes it.
  StringRef Result(TokStart, CurPtr - TokStart);
  if (Result.getAsInteger(Radix, Value))
    return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

  // Consume the [hH].
  if (Radix == 16)
    ++CurPtr;

  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
  // suffixes on integer literals.
  SkipIgnoredIntegerSuffix(CurPtr);

  return intToken(Result, Value);
}
646
647 /// LexSingleQuote: Integer: 'b'
LexSingleQuote()648 AsmToken AsmLexer::LexSingleQuote() {
649 int CurChar = getNextChar();
650
651 if (LexHLASMStrings)
652 return ReturnError(TokStart, "invalid usage of character literals");
653
654 if (LexMasmStrings) {
655 while (CurChar != EOF) {
656 if (CurChar != '\'') {
657 CurChar = getNextChar();
658 } else if (peekNextChar() == '\'') {
659 // In MASM single-quote strings, doubled single-quotes mean an escaped
660 // single quote, so should be lexed in.
661 (void)getNextChar();
662 CurChar = getNextChar();
663 } else {
664 break;
665 }
666 }
667 if (CurChar == EOF)
668 return ReturnError(TokStart, "unterminated string constant");
669 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
670 }
671
672 if (CurChar == '\\')
673 CurChar = getNextChar();
674
675 if (CurChar == EOF)
676 return ReturnError(TokStart, "unterminated single quote");
677
678 CurChar = getNextChar();
679
680 if (CurChar != '\'')
681 return ReturnError(TokStart, "single quote way too long");
682
683 // The idea here being that 'c' is basically just an integral
684 // constant.
685 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
686 long long Value;
687
688 if (Res.starts_with("\'\\")) {
689 char theChar = Res[2];
690 switch (theChar) {
691 default: Value = theChar; break;
692 case '\'': Value = '\''; break;
693 case 't': Value = '\t'; break;
694 case 'n': Value = '\n'; break;
695 case 'b': Value = '\b'; break;
696 case 'f': Value = '\f'; break;
697 case 'r': Value = '\r'; break;
698 }
699 } else
700 Value = TokStart[1];
701
702 return AsmToken(AsmToken::Integer, Res, Value);
703 }
704
705 /// LexQuote: String: "..."
LexQuote()706 AsmToken AsmLexer::LexQuote() {
707 int CurChar = getNextChar();
708 if (LexHLASMStrings)
709 return ReturnError(TokStart, "invalid usage of string literals");
710
711 if (LexMasmStrings) {
712 while (CurChar != EOF) {
713 if (CurChar != '"') {
714 CurChar = getNextChar();
715 } else if (peekNextChar() == '"') {
716 // In MASM double-quoted strings, doubled double-quotes mean an escaped
717 // double quote, so should be lexed in.
718 (void)getNextChar();
719 CurChar = getNextChar();
720 } else {
721 break;
722 }
723 }
724 if (CurChar == EOF)
725 return ReturnError(TokStart, "unterminated string constant");
726 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
727 }
728
729 while (CurChar != '"') {
730 if (CurChar == '\\') {
731 // Allow \", etc.
732 CurChar = getNextChar();
733 }
734
735 if (CurChar == EOF)
736 return ReturnError(TokStart, "unterminated string constant");
737
738 CurChar = getNextChar();
739 }
740
741 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
742 }
743
LexUntilEndOfStatement()744 StringRef AsmLexer::LexUntilEndOfStatement() {
745 TokStart = CurPtr;
746
747 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
748 !isAtStatementSeparator(CurPtr) && // End of statement marker.
749 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
750 ++CurPtr;
751 }
752 return StringRef(TokStart, CurPtr-TokStart);
753 }
754
LexUntilEndOfLine()755 StringRef AsmLexer::LexUntilEndOfLine() {
756 TokStart = CurPtr;
757
758 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
759 ++CurPtr;
760 }
761 return StringRef(TokStart, CurPtr-TokStart);
762 }
763
peekTokens(MutableArrayRef<AsmToken> Buf,bool ShouldSkipSpace)764 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
765 bool ShouldSkipSpace) {
766 SaveAndRestore SavedTokenStart(TokStart);
767 SaveAndRestore SavedCurPtr(CurPtr);
768 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
769 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
770 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
771 SaveAndRestore SavedIsPeeking(IsPeeking, true);
772 std::string SavedErr = getErr();
773 SMLoc SavedErrLoc = getErrLoc();
774
775 size_t ReadCount;
776 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
777 AsmToken Token = LexToken();
778
779 Buf[ReadCount] = Token;
780
781 if (Token.is(AsmToken::Eof)) {
782 ReadCount++;
783 break;
784 }
785 }
786
787 SetError(SavedErrLoc, SavedErr);
788 return ReadCount;
789 }
790
isAtStartOfComment(const char * Ptr)791 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
792 if (MAI.isHLASM() && !IsAtStartOfStatement)
793 return false;
794
795 StringRef CommentString = MAI.getCommentString();
796
797 if (CommentString.size() == 1)
798 return CommentString[0] == Ptr[0];
799
800 // Allow # preprocessor comments also be counted as comments for "##" cases
801 if (CommentString[1] == '#')
802 return CommentString[0] == Ptr[0];
803
804 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
805 }
806
isAtStatementSeparator(const char * Ptr)807 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
808 return strncmp(Ptr, MAI.getSeparatorString(),
809 strlen(MAI.getSeparatorString())) == 0;
810 }
811
/// Lexes and returns the next token starting at CurPtr.
///
/// Sets TokStart to the start of the token and maintains the IsAtStartOfLine
/// and IsAtStartOfStatement flags. In order, it recognises: a leading '#' at
/// the start of a statement (cpp-style hash directive or line comment), the
/// target's line comment string, the target's statement separator, a
/// synthesized EndOfStatement before Eof, and then dispatches on the first
/// character. When SkipSpace is set, whitespace is skipped by recursing.
/// Characters that match nothing produce an Error token.
AsmToken AsmLexer::LexToken() {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  // IsPeeking is set by peekTokens(); testing it here keeps the peek below
  // from triggering another peek.
  if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
    // If this starts with a '#', this may be a cpp
    // hash directive and otherwise a line comment.
    AsmToken TokenBuf[2];
    MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
    size_t num = peekTokens(Buf, true);
    // There cannot be a space preceding this
    if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
        TokenBuf[1].is(AsmToken::String)) {
      // '#' <integer> <string>: return the whole line as a HashDirective and
      // queue the two peeked tokens with UnLex().
      CurPtr = TokStart; // reset curPtr;
      StringRef s = LexUntilEndOfLine();
      UnLex(TokenBuf[1]);
      UnLex(TokenBuf[0]);
      return AsmToken(AsmToken::HashDirective, s);
    }

    if (MAI.shouldAllowAdditionalComments())
      return LexLineComment();
  }

  if (isAtStartOfComment(TokStart)) {
    // One character was consumed above; skip the rest of the comment string.
    // NOTE(review): isAtStartOfComment() matches only the first character when
    // the comment string's second character is '#', so this may step over one
    // character that is not part of the introducer -- confirm that is intended.
    CurPtr += MAI.getCommentString().size() - 1;
    return LexLineComment();
  }

  if (isAtStatementSeparator(TokStart)) {
    // One character was consumed above; skip the rest of the separator.
    CurPtr += strlen(MAI.getSeparatorString()) - 1;
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement,
                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
  }

  // If we're missing a newline at EOF, make sure we still get an
  // EndOfStatement token before the Eof token.
  if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
  }
  // Any ordinary token ends the "start of line/statement" state; the previous
  // statement flag is kept so the whitespace and '/' cases can restore it.
  IsAtStartOfLine = false;
  bool OldIsAtStartOfStatement = IsAtStartOfStatement;
  IsAtStartOfStatement = false;
  switch (CurChar) {
  default:
    // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
    // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
    // an identifier is target-dependent. These characters are handled in the
    // respective switch cases.
    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "invalid character in input");
  case EOF:
    if (EndStatementAtEOF) {
      IsAtStartOfLine = true;
      IsAtStartOfStatement = true;
    }
    return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
  case 0:
  case ' ':
  case '\t':
    // Whitespace does not change whether we are at the start of a statement.
    IsAtStartOfStatement = OldIsAtStartOfStatement;
    while (*CurPtr == ' ' || *CurPtr == '\t')
      CurPtr++;
    if (SkipSpace)
      return LexToken(); // Ignore whitespace.
    else
      return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
  case '\r': {
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    // If this is a CR followed by LF, treat that as one token.
    if (CurPtr != CurBuf.end() && *CurPtr == '\n')
      ++CurPtr;
    return AsmToken(AsmToken::EndOfStatement,
                    StringRef(TokStart, CurPtr - TokStart));
  }
  case '\n':
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
  case '$': {
    // '$' may introduce a Motorola hex integer, start an identifier, or be a
    // plain Dollar token, depending on lexer and target settings.
    if (LexMotorolaIntegers && isHexDigit(*CurPtr))
      return LexDigit();
    if (MAI.doesAllowDollarAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
  }
  case '@':
    if (MAI.doesAllowAtAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::At, StringRef(TokStart, 1));
  case '#':
    if (MAI.isHLASM())
      return LexIdentifier();
    return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
  case '?':
    if (MAI.doesAllowQuestionAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
  case '=':
    if (*CurPtr == '=') {
      ++CurPtr;
      return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
  case '-':
    if (*CurPtr == '>') {
      ++CurPtr;
      return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
  case '|':
    if (*CurPtr == '|') {
      ++CurPtr;
      return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
  case '&':
    if (*CurPtr == '&') {
      ++CurPtr;
      return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
  case '!':
    if (*CurPtr == '=') {
      ++CurPtr;
      return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
  case '%':
    // '%' followed by a binary digit is a Motorola binary integer.
    if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
      return LexDigit();
    }
    return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
  case '/':
    // LexSlash() decides how the statement flag changes, so hand it the
    // previous value.
    IsAtStartOfStatement = OldIsAtStartOfStatement;
    return LexSlash();
  case '\'': return LexSingleQuote();
  case '"': return LexQuote();
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return LexDigit();
  case '<':
    switch (*CurPtr) {
    case '<':
      ++CurPtr;
      return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
    case '=':
      ++CurPtr;
      return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
    case '>':
      ++CurPtr;
      return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
    default:
      return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
    }
  case '>':
    switch (*CurPtr) {
    case '>':
      ++CurPtr;
      return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
    case '=':
      ++CurPtr;
      return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
    default:
      return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
    }

  // TODO: Quoted identifiers (objc methods etc)
  // local labels: [0-9][:]
  // Forward/backward labels: [0-9][fb]
  // Integers, fp constants, character constants.
  }
}
1007