1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class implements the lexer for assembly files.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/StringSwitch.h"
19 #include "llvm/MC/MCAsmInfo.h"
20 #include "llvm/MC/MCParser/MCAsmLexer.h"
21 #include "llvm/Support/Compiler.h"
22 #include "llvm/Support/SMLoc.h"
23 #include "llvm/Support/SaveAndRestore.h"
24 #include <cassert>
25 #include <cctype>
26 #include <cstdio>
27 #include <cstring>
28 #include <string>
29 #include <tuple>
30 #include <utility>
31
32 using namespace llvm;
33
AsmLexer(const MCAsmInfo & MAI)34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@");
36 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
37 }
38
39 AsmLexer::~AsmLexer() = default;
40
setBuffer(StringRef Buf,const char * ptr,bool EndStatementAtEOF)41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42 bool EndStatementAtEOF) {
43 CurBuf = Buf;
44
45 if (ptr)
46 CurPtr = ptr;
47 else
48 CurPtr = CurBuf.begin();
49
50 TokStart = nullptr;
51 this->EndStatementAtEOF = EndStatementAtEOF;
52 }
53
54 /// ReturnError - Set the error to the specified string at the specified
55 /// location. This is defined to always return AsmToken::Error.
ReturnError(const char * Loc,const std::string & Msg)56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57 SetError(SMLoc::getFromPointer(Loc), Msg);
58
59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60 }
61
getNextChar()62 int AsmLexer::getNextChar() {
63 if (CurPtr == CurBuf.end())
64 return EOF;
65 return (unsigned char)*CurPtr++;
66 }
67
peekNextChar()68 int AsmLexer::peekNextChar() {
69 if (CurPtr == CurBuf.end())
70 return EOF;
71 return (unsigned char)*CurPtr;
72 }
73
74 /// The leading integral digit sequence and dot should have already been
75 /// consumed, some or all of the fractional digit sequence *can* have been
76 /// consumed.
LexFloatLiteral()77 AsmToken AsmLexer::LexFloatLiteral() {
78 // Skip the fractional digit sequence.
79 while (isDigit(*CurPtr))
80 ++CurPtr;
81
82 if (*CurPtr == '-' || *CurPtr == '+')
83 return ReturnError(CurPtr, "invalid sign in float literal");
84
85 // Check for exponent
86 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87 ++CurPtr;
88
89 if (*CurPtr == '-' || *CurPtr == '+')
90 ++CurPtr;
91
92 while (isDigit(*CurPtr))
93 ++CurPtr;
94 }
95
96 return AsmToken(AsmToken::Real,
97 StringRef(TokStart, CurPtr - TokStart));
98 }
99
100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101 /// while making sure there are enough actual digits around for the constant to
102 /// be valid.
103 ///
104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105 /// before we get here.
LexHexFloatLiteral(bool NoIntDigits)106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108 "unexpected parse state in floating hex");
109 bool NoFracDigits = true;
110
111 // Skip the fractional part if there is one
112 if (*CurPtr == '.') {
113 ++CurPtr;
114
115 const char *FracStart = CurPtr;
116 while (isHexDigit(*CurPtr))
117 ++CurPtr;
118
119 NoFracDigits = CurPtr == FracStart;
120 }
121
122 if (NoIntDigits && NoFracDigits)
123 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124 "expected at least one significand digit");
125
126 // Make sure we do have some kind of proper exponent part
127 if (*CurPtr != 'p' && *CurPtr != 'P')
128 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
129 "expected exponent part 'p'");
130 ++CurPtr;
131
132 if (*CurPtr == '+' || *CurPtr == '-')
133 ++CurPtr;
134
135 // N.b. exponent digits are *not* hex
136 const char *ExpStart = CurPtr;
137 while (isDigit(*CurPtr))
138 ++CurPtr;
139
140 if (CurPtr == ExpStart)
141 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
142 "expected at least one exponent digit");
143
144 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
145 }
146
147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
isIdentifierChar(char C,bool AllowAt,bool AllowHash)148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150 (AllowAt && C == '@') || (AllowHash && C == '#');
151 }
152
LexIdentifier()153 AsmToken AsmLexer::LexIdentifier() {
154 // Check for floating point literals.
155 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
156 // Disambiguate a .1243foo identifier from a floating literal.
157 while (isDigit(*CurPtr))
158 ++CurPtr;
159
160 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
161 AllowHashInIdentifier) ||
162 *CurPtr == 'e' || *CurPtr == 'E')
163 return LexFloatLiteral();
164 }
165
166 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
167 ++CurPtr;
168
169 // Handle . as a special case.
170 if (CurPtr == TokStart+1 && TokStart[0] == '.')
171 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
172
173 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
174 }
175
176 /// LexSlash: Slash: /
177 /// C-Style Comment: /* ... */
178 /// C-style Comment: // ...
LexSlash()179 AsmToken AsmLexer::LexSlash() {
180 if (!MAI.shouldAllowAdditionalComments()) {
181 IsAtStartOfStatement = false;
182 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
183 }
184
185 switch (*CurPtr) {
186 case '*':
187 IsAtStartOfStatement = false;
188 break; // C style comment.
189 case '/':
190 ++CurPtr;
191 return LexLineComment();
192 default:
193 IsAtStartOfStatement = false;
194 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
195 }
196
197 // C Style comment.
198 ++CurPtr; // skip the star.
199 const char *CommentTextStart = CurPtr;
200 while (CurPtr != CurBuf.end()) {
201 switch (*CurPtr++) {
202 case '*':
203 // End of the comment?
204 if (*CurPtr != '/')
205 break;
206 // If we have a CommentConsumer, notify it about the comment.
207 if (CommentConsumer) {
208 CommentConsumer->HandleComment(
209 SMLoc::getFromPointer(CommentTextStart),
210 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211 }
212 ++CurPtr; // End the */.
213 return AsmToken(AsmToken::Comment,
214 StringRef(TokStart, CurPtr - TokStart));
215 }
216 }
217 return ReturnError(TokStart, "unterminated comment");
218 }
219
220 /// LexLineComment: Comment: #[^\n]*
221 /// : //[^\n]*
LexLineComment()222 AsmToken AsmLexer::LexLineComment() {
223 // Mark This as an end of statement with a body of the
224 // comment. While it would be nicer to leave this two tokens,
225 // backwards compatability with TargetParsers makes keeping this in this form
226 // better.
227 const char *CommentTextStart = CurPtr;
228 int CurChar = getNextChar();
229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230 CurChar = getNextChar();
231 const char *NewlinePtr = CurPtr;
232 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
233 ++CurPtr;
234
235 // If we have a CommentConsumer, notify it about the comment.
236 if (CommentConsumer) {
237 CommentConsumer->HandleComment(
238 SMLoc::getFromPointer(CommentTextStart),
239 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
240 }
241
242 IsAtStartOfLine = true;
243 // This is a whole line comment. leave newline
244 if (IsAtStartOfStatement)
245 return AsmToken(AsmToken::EndOfStatement,
246 StringRef(TokStart, CurPtr - TokStart));
247 IsAtStartOfStatement = true;
248
249 return AsmToken(AsmToken::EndOfStatement,
250 StringRef(TokStart, CurPtr - 1 - TokStart));
251 }
252
/// Advance \p CurPtr past an optional, case-insensitive integer type suffix:
/// at most one 'U' followed by at most two 'L's (U, L, LL, UL, ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  if (*CurPtr == 'u' || *CurPtr == 'U')
    ++CurPtr;

  for (int Remaining = 2;
       Remaining > 0 && (*CurPtr == 'l' || *CurPtr == 'L'); --Remaining)
    ++CurPtr;
}
262
263 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264 // integer as a hexadecimal, possibly with leading zeroes.
doHexLookAhead(const char * & CurPtr,unsigned DefaultRadix,bool LexHex)265 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
266 bool LexHex) {
267 const char *FirstNonDec = nullptr;
268 const char *LookAhead = CurPtr;
269 while (true) {
270 if (isDigit(*LookAhead)) {
271 ++LookAhead;
272 } else {
273 if (!FirstNonDec)
274 FirstNonDec = LookAhead;
275
276 // Keep going if we are looking for a 'h' suffix.
277 if (LexHex && isHexDigit(*LookAhead))
278 ++LookAhead;
279 else
280 break;
281 }
282 }
283 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
284 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
285 if (isHex)
286 return 16;
287 return DefaultRadix;
288 }
289
findLastDigit(const char * CurPtr,unsigned DefaultRadix)290 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
291 while (hexDigitValue(*CurPtr) < DefaultRadix) {
292 ++CurPtr;
293 }
294 return CurPtr;
295 }
296
intToken(StringRef Ref,APInt & Value)297 static AsmToken intToken(StringRef Ref, APInt &Value) {
298 if (Value.isIntN(64))
299 return AsmToken(AsmToken::Integer, Ref, Value);
300 return AsmToken(AsmToken::BigNum, Ref, Value);
301 }
302
/// Human-readable name of \p Radix for diagnostics: "binary", "octal",
/// "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-N" otherwise.
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
317
318 /// LexDigit: First character is [0-9].
319 /// Local Label: [0-9][:]
320 /// Forward/Backward Label: [0-9][fb]
321 /// Binary integer: 0b[01]+
322 /// Octal integer: 0[0-7]+
323 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324 /// Decimal integer: [1-9][0-9]*
LexDigit()325 AsmToken AsmLexer::LexDigit() {
326 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327 // MASM-flavor octal integer: [0-7]+[oOqQ]
328 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
330 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
331 const char *FirstNonBinary =
332 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
333 const char *FirstNonDecimal =
334 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
335 const char *OldCurPtr = CurPtr;
336 while (isHexDigit(*CurPtr)) {
337 switch (*CurPtr) {
338 default:
339 if (!FirstNonDecimal) {
340 FirstNonDecimal = CurPtr;
341 }
342 [[fallthrough]];
343 case '9':
344 case '8':
345 case '7':
346 case '6':
347 case '5':
348 case '4':
349 case '3':
350 case '2':
351 if (!FirstNonBinary) {
352 FirstNonBinary = CurPtr;
353 }
354 break;
355 case '1':
356 case '0':
357 break;
358 }
359 ++CurPtr;
360 }
361 if (*CurPtr == '.') {
362 // MASM float literals (other than hex floats) always contain a ".", and
363 // are always written in decimal.
364 ++CurPtr;
365 return LexFloatLiteral();
366 }
367
368 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
369 ++CurPtr;
370 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
371 }
372
373 unsigned Radix = 0;
374 if (*CurPtr == 'h' || *CurPtr == 'H') {
375 // hexadecimal number
376 ++CurPtr;
377 Radix = 16;
378 } else if (*CurPtr == 't' || *CurPtr == 'T') {
379 // decimal number
380 ++CurPtr;
381 Radix = 10;
382 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
383 *CurPtr == 'Q') {
384 // octal number
385 ++CurPtr;
386 Radix = 8;
387 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
388 // binary number
389 ++CurPtr;
390 Radix = 2;
391 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
392 DefaultRadix < 14 &&
393 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
394 Radix = 10;
395 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
396 DefaultRadix < 12 &&
397 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
398 Radix = 2;
399 }
400
401 if (Radix) {
402 StringRef Result(TokStart, CurPtr - TokStart);
403 APInt Value(128, 0, true);
404
405 if (Result.drop_back().getAsInteger(Radix, Value))
406 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
407
408 // MSVC accepts and ignores type suffices on integer literals.
409 SkipIgnoredIntegerSuffix(CurPtr);
410
411 return intToken(Result, Value);
412 }
413
414 // default-radix integers, or floating point numbers, fall through
415 CurPtr = OldCurPtr;
416 }
417
418 // MASM default-radix integers: [0-9a-fA-F]+
419 // (All other integer literals have a radix specifier.)
420 if (LexMasmIntegers && UseMasmDefaultRadix) {
421 CurPtr = findLastDigit(CurPtr, 16);
422 StringRef Result(TokStart, CurPtr - TokStart);
423
424 APInt Value(128, 0, true);
425 if (Result.getAsInteger(DefaultRadix, Value)) {
426 return ReturnError(TokStart,
427 "invalid " + radixName(DefaultRadix) + " number");
428 }
429
430 return intToken(Result, Value);
431 }
432
433 // Motorola hex integers: $[0-9a-fA-F]+
434 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
435 const char *NumStart = CurPtr;
436 while (isHexDigit(CurPtr[0]))
437 ++CurPtr;
438
439 APInt Result(128, 0);
440 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
441 return ReturnError(TokStart, "invalid hexadecimal number");
442
443 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
444 }
445
446 // Motorola binary integers: %[01]+
447 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
448 const char *NumStart = CurPtr;
449 while (*CurPtr == '0' || *CurPtr == '1')
450 ++CurPtr;
451
452 APInt Result(128, 0);
453 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
454 return ReturnError(TokStart, "invalid binary number");
455
456 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
457 }
458
459 // Decimal integer: [1-9][0-9]*
460 // HLASM-flavour decimal integer: [0-9][0-9]*
461 // FIXME: Later on, support for fb for HLASM has to be added in
462 // as they probably would be needed for asm goto
463 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
464 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
465
466 if (!LexHLASMIntegers) {
467 bool IsHex = Radix == 16;
468 // Check for floating point literals.
469 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
470 if (*CurPtr == '.')
471 ++CurPtr;
472 return LexFloatLiteral();
473 }
474 }
475
476 StringRef Result(TokStart, CurPtr - TokStart);
477
478 APInt Value(128, 0, true);
479 if (Result.getAsInteger(Radix, Value))
480 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
481
482 if (!LexHLASMIntegers)
483 // The darwin/x86 (and x86-64) assembler accepts and ignores type
484 // suffices on integer literals.
485 SkipIgnoredIntegerSuffix(CurPtr);
486
487 return intToken(Result, Value);
488 }
489
490 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
491 ++CurPtr;
492 // See if we actually have "0b" as part of something like "jmp 0b\n"
493 if (!isDigit(CurPtr[0])) {
494 --CurPtr;
495 StringRef Result(TokStart, CurPtr - TokStart);
496 return AsmToken(AsmToken::Integer, Result, 0);
497 }
498 const char *NumStart = CurPtr;
499 while (CurPtr[0] == '0' || CurPtr[0] == '1')
500 ++CurPtr;
501
502 // Requires at least one binary digit.
503 if (CurPtr == NumStart)
504 return ReturnError(TokStart, "invalid binary number");
505
506 StringRef Result(TokStart, CurPtr - TokStart);
507
508 APInt Value(128, 0, true);
509 if (Result.substr(2).getAsInteger(2, Value))
510 return ReturnError(TokStart, "invalid binary number");
511
512 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513 // suffixes on integer literals.
514 SkipIgnoredIntegerSuffix(CurPtr);
515
516 return intToken(Result, Value);
517 }
518
519 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
520 ++CurPtr;
521 const char *NumStart = CurPtr;
522 while (isHexDigit(CurPtr[0]))
523 ++CurPtr;
524
525 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526 // diagnosed by LexHexFloatLiteral).
527 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
528 return LexHexFloatLiteral(NumStart == CurPtr);
529
530 // Otherwise requires at least one hex digit.
531 if (CurPtr == NumStart)
532 return ReturnError(CurPtr-2, "invalid hexadecimal number");
533
534 APInt Result(128, 0);
535 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
536 return ReturnError(TokStart, "invalid hexadecimal number");
537
538 // Consume the optional [hH].
539 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
540 ++CurPtr;
541
542 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543 // suffixes on integer literals.
544 SkipIgnoredIntegerSuffix(CurPtr);
545
546 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
547 }
548
549 // Either octal or hexadecimal.
550 APInt Value(128, 0, true);
551 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
552 StringRef Result(TokStart, CurPtr - TokStart);
553 if (Result.getAsInteger(Radix, Value))
554 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
555
556 // Consume the [hH].
557 if (Radix == 16)
558 ++CurPtr;
559
560 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561 // suffixes on integer literals.
562 SkipIgnoredIntegerSuffix(CurPtr);
563
564 return intToken(Result, Value);
565 }
566
567 /// LexSingleQuote: Integer: 'b'
LexSingleQuote()568 AsmToken AsmLexer::LexSingleQuote() {
569 int CurChar = getNextChar();
570
571 if (LexHLASMStrings)
572 return ReturnError(TokStart, "invalid usage of character literals");
573
574 if (LexMasmStrings) {
575 while (CurChar != EOF) {
576 if (CurChar != '\'') {
577 CurChar = getNextChar();
578 } else if (peekNextChar() == '\'') {
579 // In MASM single-quote strings, doubled single-quotes mean an escaped
580 // single quote, so should be lexed in.
581 (void)getNextChar();
582 CurChar = getNextChar();
583 } else {
584 break;
585 }
586 }
587 if (CurChar == EOF)
588 return ReturnError(TokStart, "unterminated string constant");
589 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
590 }
591
592 if (CurChar == '\\')
593 CurChar = getNextChar();
594
595 if (CurChar == EOF)
596 return ReturnError(TokStart, "unterminated single quote");
597
598 CurChar = getNextChar();
599
600 if (CurChar != '\'')
601 return ReturnError(TokStart, "single quote way too long");
602
603 // The idea here being that 'c' is basically just an integral
604 // constant.
605 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
606 long long Value;
607
608 if (Res.starts_with("\'\\")) {
609 char theChar = Res[2];
610 switch (theChar) {
611 default: Value = theChar; break;
612 case '\'': Value = '\''; break;
613 case 't': Value = '\t'; break;
614 case 'n': Value = '\n'; break;
615 case 'b': Value = '\b'; break;
616 case 'f': Value = '\f'; break;
617 case 'r': Value = '\r'; break;
618 }
619 } else
620 Value = TokStart[1];
621
622 return AsmToken(AsmToken::Integer, Res, Value);
623 }
624
625 /// LexQuote: String: "..."
LexQuote()626 AsmToken AsmLexer::LexQuote() {
627 int CurChar = getNextChar();
628 if (LexHLASMStrings)
629 return ReturnError(TokStart, "invalid usage of string literals");
630
631 if (LexMasmStrings) {
632 while (CurChar != EOF) {
633 if (CurChar != '"') {
634 CurChar = getNextChar();
635 } else if (peekNextChar() == '"') {
636 // In MASM double-quoted strings, doubled double-quotes mean an escaped
637 // double quote, so should be lexed in.
638 (void)getNextChar();
639 CurChar = getNextChar();
640 } else {
641 break;
642 }
643 }
644 if (CurChar == EOF)
645 return ReturnError(TokStart, "unterminated string constant");
646 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
647 }
648
649 while (CurChar != '"') {
650 if (CurChar == '\\') {
651 // Allow \", etc.
652 CurChar = getNextChar();
653 }
654
655 if (CurChar == EOF)
656 return ReturnError(TokStart, "unterminated string constant");
657
658 CurChar = getNextChar();
659 }
660
661 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
662 }
663
LexUntilEndOfStatement()664 StringRef AsmLexer::LexUntilEndOfStatement() {
665 TokStart = CurPtr;
666
667 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
668 !isAtStatementSeparator(CurPtr) && // End of statement marker.
669 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
670 ++CurPtr;
671 }
672 return StringRef(TokStart, CurPtr-TokStart);
673 }
674
LexUntilEndOfLine()675 StringRef AsmLexer::LexUntilEndOfLine() {
676 TokStart = CurPtr;
677
678 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
679 ++CurPtr;
680 }
681 return StringRef(TokStart, CurPtr-TokStart);
682 }
683
peekTokens(MutableArrayRef<AsmToken> Buf,bool ShouldSkipSpace)684 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
685 bool ShouldSkipSpace) {
686 SaveAndRestore SavedTokenStart(TokStart);
687 SaveAndRestore SavedCurPtr(CurPtr);
688 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
689 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
690 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
691 SaveAndRestore SavedIsPeeking(IsPeeking, true);
692 std::string SavedErr = getErr();
693 SMLoc SavedErrLoc = getErrLoc();
694
695 size_t ReadCount;
696 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
697 AsmToken Token = LexToken();
698
699 Buf[ReadCount] = Token;
700
701 if (Token.is(AsmToken::Eof))
702 break;
703 }
704
705 SetError(SavedErrLoc, SavedErr);
706 return ReadCount;
707 }
708
isAtStartOfComment(const char * Ptr)709 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
710 if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
711 return false;
712
713 StringRef CommentString = MAI.getCommentString();
714
715 if (CommentString.size() == 1)
716 return CommentString[0] == Ptr[0];
717
718 // Allow # preprocessor comments also be counted as comments for "##" cases
719 if (CommentString[1] == '#')
720 return CommentString[0] == Ptr[0];
721
722 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
723 }
724
isAtStatementSeparator(const char * Ptr)725 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
726 return strncmp(Ptr, MAI.getSeparatorString(),
727 strlen(MAI.getSeparatorString())) == 0;
728 }
729
LexToken()730 AsmToken AsmLexer::LexToken() {
731 TokStart = CurPtr;
732 // This always consumes at least one character.
733 int CurChar = getNextChar();
734
735 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
736 // If this starts with a '#', this may be a cpp
737 // hash directive and otherwise a line comment.
738 AsmToken TokenBuf[2];
739 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
740 size_t num = peekTokens(Buf, true);
741 // There cannot be a space preceding this
742 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
743 TokenBuf[1].is(AsmToken::String)) {
744 CurPtr = TokStart; // reset curPtr;
745 StringRef s = LexUntilEndOfLine();
746 UnLex(TokenBuf[1]);
747 UnLex(TokenBuf[0]);
748 return AsmToken(AsmToken::HashDirective, s);
749 }
750
751 if (MAI.shouldAllowAdditionalComments())
752 return LexLineComment();
753 }
754
755 if (isAtStartOfComment(TokStart))
756 return LexLineComment();
757
758 if (isAtStatementSeparator(TokStart)) {
759 CurPtr += strlen(MAI.getSeparatorString()) - 1;
760 IsAtStartOfLine = true;
761 IsAtStartOfStatement = true;
762 return AsmToken(AsmToken::EndOfStatement,
763 StringRef(TokStart, strlen(MAI.getSeparatorString())));
764 }
765
766 // If we're missing a newline at EOF, make sure we still get an
767 // EndOfStatement token before the Eof token.
768 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
769 IsAtStartOfLine = true;
770 IsAtStartOfStatement = true;
771 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
772 }
773 IsAtStartOfLine = false;
774 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
775 IsAtStartOfStatement = false;
776 switch (CurChar) {
777 default:
778 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
779 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
780 // an identifier is target-dependent. These characters are handled in the
781 // respective switch cases.
782 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
783 return LexIdentifier();
784
785 // Unknown character, emit an error.
786 return ReturnError(TokStart, "invalid character in input");
787 case EOF:
788 if (EndStatementAtEOF) {
789 IsAtStartOfLine = true;
790 IsAtStartOfStatement = true;
791 }
792 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
793 case 0:
794 case ' ':
795 case '\t':
796 IsAtStartOfStatement = OldIsAtStartOfStatement;
797 while (*CurPtr == ' ' || *CurPtr == '\t')
798 CurPtr++;
799 if (SkipSpace)
800 return LexToken(); // Ignore whitespace.
801 else
802 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
803 case '\r': {
804 IsAtStartOfLine = true;
805 IsAtStartOfStatement = true;
806 // If this is a CR followed by LF, treat that as one token.
807 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
808 ++CurPtr;
809 return AsmToken(AsmToken::EndOfStatement,
810 StringRef(TokStart, CurPtr - TokStart));
811 }
812 case '\n':
813 IsAtStartOfLine = true;
814 IsAtStartOfStatement = true;
815 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
816 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
817 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
818 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
819 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
820 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
821 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
822 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
823 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
824 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
825 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
826 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
827 case '$': {
828 if (LexMotorolaIntegers && isHexDigit(*CurPtr))
829 return LexDigit();
830 if (MAI.doesAllowDollarAtStartOfIdentifier())
831 return LexIdentifier();
832 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
833 }
834 case '@':
835 if (MAI.doesAllowAtAtStartOfIdentifier())
836 return LexIdentifier();
837 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
838 case '#':
839 if (MAI.doesAllowHashAtStartOfIdentifier())
840 return LexIdentifier();
841 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
842 case '?':
843 if (MAI.doesAllowQuestionAtStartOfIdentifier())
844 return LexIdentifier();
845 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
846 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
847 case '=':
848 if (*CurPtr == '=') {
849 ++CurPtr;
850 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
851 }
852 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
853 case '-':
854 if (*CurPtr == '>') {
855 ++CurPtr;
856 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
857 }
858 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
859 case '|':
860 if (*CurPtr == '|') {
861 ++CurPtr;
862 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
863 }
864 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
865 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
866 case '&':
867 if (*CurPtr == '&') {
868 ++CurPtr;
869 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
870 }
871 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
872 case '!':
873 if (*CurPtr == '=') {
874 ++CurPtr;
875 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
876 }
877 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
878 case '%':
879 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
880 return LexDigit();
881 }
882
883 if (MAI.hasMipsExpressions()) {
884 AsmToken::TokenKind Operator;
885 unsigned OperatorLength;
886
887 std::tie(Operator, OperatorLength) =
888 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
889 StringRef(CurPtr))
890 .StartsWith("call16", {AsmToken::PercentCall16, 7})
891 .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
892 .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
893 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
894 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
895 .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
896 .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
897 .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
898 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
899 .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
900 .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
901 .StartsWith("got", {AsmToken::PercentGot, 4})
902 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
903 .StartsWith("higher", {AsmToken::PercentHigher, 7})
904 .StartsWith("highest", {AsmToken::PercentHighest, 8})
905 .StartsWith("hi", {AsmToken::PercentHi, 3})
906 .StartsWith("lo", {AsmToken::PercentLo, 3})
907 .StartsWith("neg", {AsmToken::PercentNeg, 4})
908 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
909 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
910 .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
911 .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
912 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
913 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
914 .Default({AsmToken::Percent, 1});
915
916 if (Operator != AsmToken::Percent) {
917 CurPtr += OperatorLength - 1;
918 return AsmToken(Operator, StringRef(TokStart, OperatorLength));
919 }
920 }
921 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
922 case '/':
923 IsAtStartOfStatement = OldIsAtStartOfStatement;
924 return LexSlash();
925 case '\'': return LexSingleQuote();
926 case '"': return LexQuote();
927 case '0': case '1': case '2': case '3': case '4':
928 case '5': case '6': case '7': case '8': case '9':
929 return LexDigit();
930 case '<':
931 switch (*CurPtr) {
932 case '<':
933 ++CurPtr;
934 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
935 case '=':
936 ++CurPtr;
937 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
938 case '>':
939 ++CurPtr;
940 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
941 default:
942 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
943 }
944 case '>':
945 switch (*CurPtr) {
946 case '>':
947 ++CurPtr;
948 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
949 case '=':
950 ++CurPtr;
951 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
952 default:
953 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
954 }
955
956 // TODO: Quoted identifiers (objc methods etc)
957 // local labels: [0-9][:]
958 // Forward/backward labels: [0-9][fb]
959 // Integers, fp constants, character constants.
960 }
961 }
962