xref: /freebsd/contrib/llvm-project/llvm/lib/MC/MCParser/AsmLexer.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class implements the lexer for assembly files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/StringSwitch.h"
19 #include "llvm/MC/MCAsmInfo.h"
20 #include "llvm/MC/MCParser/MCAsmLexer.h"
21 #include "llvm/Support/Compiler.h"
22 #include "llvm/Support/SMLoc.h"
23 #include "llvm/Support/SaveAndRestore.h"
24 #include <cassert>
25 #include <cctype>
26 #include <cstdio>
27 #include <cstring>
28 #include <string>
29 #include <tuple>
30 #include <utility>
31 
32 using namespace llvm;
33 
AsmLexer(const MCAsmInfo & MAI)34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35   AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@");
36   LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
37 }
38 
39 AsmLexer::~AsmLexer() = default;
40 
setBuffer(StringRef Buf,const char * ptr,bool EndStatementAtEOF)41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42                          bool EndStatementAtEOF) {
43   CurBuf = Buf;
44 
45   if (ptr)
46     CurPtr = ptr;
47   else
48     CurPtr = CurBuf.begin();
49 
50   TokStart = nullptr;
51   this->EndStatementAtEOF = EndStatementAtEOF;
52 }
53 
54 /// ReturnError - Set the error to the specified string at the specified
55 /// location.  This is defined to always return AsmToken::Error.
ReturnError(const char * Loc,const std::string & Msg)56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57   SetError(SMLoc::getFromPointer(Loc), Msg);
58 
59   return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60 }
61 
getNextChar()62 int AsmLexer::getNextChar() {
63   if (CurPtr == CurBuf.end())
64     return EOF;
65   return (unsigned char)*CurPtr++;
66 }
67 
peekNextChar()68 int AsmLexer::peekNextChar() {
69   if (CurPtr == CurBuf.end())
70     return EOF;
71   return (unsigned char)*CurPtr;
72 }
73 
74 /// The leading integral digit sequence and dot should have already been
75 /// consumed, some or all of the fractional digit sequence *can* have been
76 /// consumed.
LexFloatLiteral()77 AsmToken AsmLexer::LexFloatLiteral() {
78   // Skip the fractional digit sequence.
79   while (isDigit(*CurPtr))
80     ++CurPtr;
81 
82   if (*CurPtr == '-' || *CurPtr == '+')
83     return ReturnError(CurPtr, "invalid sign in float literal");
84 
85   // Check for exponent
86   if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87     ++CurPtr;
88 
89     if (*CurPtr == '-' || *CurPtr == '+')
90       ++CurPtr;
91 
92     while (isDigit(*CurPtr))
93       ++CurPtr;
94   }
95 
96   return AsmToken(AsmToken::Real,
97                   StringRef(TokStart, CurPtr - TokStart));
98 }
99 
100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101 /// while making sure there are enough actual digits around for the constant to
102 /// be valid.
103 ///
104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105 /// before we get here.
LexHexFloatLiteral(bool NoIntDigits)106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107   assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108          "unexpected parse state in floating hex");
109   bool NoFracDigits = true;
110 
111   // Skip the fractional part if there is one
112   if (*CurPtr == '.') {
113     ++CurPtr;
114 
115     const char *FracStart = CurPtr;
116     while (isHexDigit(*CurPtr))
117       ++CurPtr;
118 
119     NoFracDigits = CurPtr == FracStart;
120   }
121 
122   if (NoIntDigits && NoFracDigits)
123     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124                                  "expected at least one significand digit");
125 
126   // Make sure we do have some kind of proper exponent part
127   if (*CurPtr != 'p' && *CurPtr != 'P')
128     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
129                                  "expected exponent part 'p'");
130   ++CurPtr;
131 
132   if (*CurPtr == '+' || *CurPtr == '-')
133     ++CurPtr;
134 
135   // N.b. exponent digits are *not* hex
136   const char *ExpStart = CurPtr;
137   while (isDigit(*CurPtr))
138     ++CurPtr;
139 
140   if (CurPtr == ExpStart)
141     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
142                                  "expected at least one exponent digit");
143 
144   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
145 }
146 
147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
isIdentifierChar(char C,bool AllowAt,bool AllowHash)148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149   return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150          (AllowAt && C == '@') || (AllowHash && C == '#');
151 }
152 
LexIdentifier()153 AsmToken AsmLexer::LexIdentifier() {
154   // Check for floating point literals.
155   if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
156     // Disambiguate a .1243foo identifier from a floating literal.
157     while (isDigit(*CurPtr))
158       ++CurPtr;
159 
160     if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
161                           AllowHashInIdentifier) ||
162         *CurPtr == 'e' || *CurPtr == 'E')
163       return LexFloatLiteral();
164   }
165 
166   while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
167     ++CurPtr;
168 
169   // Handle . as a special case.
170   if (CurPtr == TokStart+1 && TokStart[0] == '.')
171     return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
172 
173   return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
174 }
175 
176 /// LexSlash: Slash: /
177 ///           C-Style Comment: /* ... */
178 ///           C-style Comment: // ...
LexSlash()179 AsmToken AsmLexer::LexSlash() {
180   if (!MAI.shouldAllowAdditionalComments()) {
181     IsAtStartOfStatement = false;
182     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
183   }
184 
185   switch (*CurPtr) {
186   case '*':
187     IsAtStartOfStatement = false;
188     break; // C style comment.
189   case '/':
190     ++CurPtr;
191     return LexLineComment();
192   default:
193     IsAtStartOfStatement = false;
194     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
195   }
196 
197   // C Style comment.
198   ++CurPtr;  // skip the star.
199   const char *CommentTextStart = CurPtr;
200   while (CurPtr != CurBuf.end()) {
201     switch (*CurPtr++) {
202     case '*':
203       // End of the comment?
204       if (*CurPtr != '/')
205         break;
206       // If we have a CommentConsumer, notify it about the comment.
207       if (CommentConsumer) {
208         CommentConsumer->HandleComment(
209             SMLoc::getFromPointer(CommentTextStart),
210             StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211       }
212       ++CurPtr;   // End the */.
213       return AsmToken(AsmToken::Comment,
214                       StringRef(TokStart, CurPtr - TokStart));
215     }
216   }
217   return ReturnError(TokStart, "unterminated comment");
218 }
219 
220 /// LexLineComment: Comment: #[^\n]*
221 ///                        : //[^\n]*
LexLineComment()222 AsmToken AsmLexer::LexLineComment() {
223   // Mark This as an end of statement with a body of the
224   // comment. While it would be nicer to leave this two tokens,
225   // backwards compatability with TargetParsers makes keeping this in this form
226   // better.
227   const char *CommentTextStart = CurPtr;
228   int CurChar = getNextChar();
229   while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230     CurChar = getNextChar();
231   const char *NewlinePtr = CurPtr;
232   if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
233     ++CurPtr;
234 
235   // If we have a CommentConsumer, notify it about the comment.
236   if (CommentConsumer) {
237     CommentConsumer->HandleComment(
238         SMLoc::getFromPointer(CommentTextStart),
239         StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
240   }
241 
242   IsAtStartOfLine = true;
243   // This is a whole line comment. leave newline
244   if (IsAtStartOfStatement)
245     return AsmToken(AsmToken::EndOfStatement,
246                     StringRef(TokStart, CurPtr - TokStart));
247   IsAtStartOfStatement = true;
248 
249   return AsmToken(AsmToken::EndOfStatement,
250                   StringRef(TokStart, CurPtr - 1 - TokStart));
251 }
252 
/// Advance \p CurPtr past an optional, case-insensitive integer type suffix:
/// an optional 'U' followed by at most two 'L's (U, L, UL, LL, ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const auto IsEither = [](char C, char Upper, char Lower) {
    return C == Upper || C == Lower;
  };

  if (IsEither(*CurPtr, 'U', 'u'))
    ++CurPtr;

  // Up to two 'L' characters may follow.
  for (int Remaining = 2; Remaining > 0 && IsEither(*CurPtr, 'L', 'l');
       --Remaining)
    ++CurPtr;
}
262 
263 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264 // integer as a hexadecimal, possibly with leading zeroes.
doHexLookAhead(const char * & CurPtr,unsigned DefaultRadix,bool LexHex)265 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
266                                bool LexHex) {
267   const char *FirstNonDec = nullptr;
268   const char *LookAhead = CurPtr;
269   while (true) {
270     if (isDigit(*LookAhead)) {
271       ++LookAhead;
272     } else {
273       if (!FirstNonDec)
274         FirstNonDec = LookAhead;
275 
276       // Keep going if we are looking for a 'h' suffix.
277       if (LexHex && isHexDigit(*LookAhead))
278         ++LookAhead;
279       else
280         break;
281     }
282   }
283   bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
284   CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
285   if (isHex)
286     return 16;
287   return DefaultRadix;
288 }
289 
findLastDigit(const char * CurPtr,unsigned DefaultRadix)290 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
291   while (hexDigitValue(*CurPtr) < DefaultRadix) {
292     ++CurPtr;
293   }
294   return CurPtr;
295 }
296 
intToken(StringRef Ref,APInt & Value)297 static AsmToken intToken(StringRef Ref, APInt &Value) {
298   if (Value.isIntN(64))
299     return AsmToken(AsmToken::Integer, Ref, Value);
300   return AsmToken(AsmToken::BigNum, Ref, Value);
301 }
302 
/// Human-readable name of \p Radix for use in diagnostics. The common radixes
/// get their usual names; everything else is spelled "base-N".
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
317 
318 /// LexDigit: First character is [0-9].
319 ///   Local Label: [0-9][:]
320 ///   Forward/Backward Label: [0-9][fb]
321 ///   Binary integer: 0b[01]+
322 ///   Octal integer: 0[0-7]+
323 ///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324 ///   Decimal integer: [1-9][0-9]*
LexDigit()325 AsmToken AsmLexer::LexDigit() {
326   // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327   // MASM-flavor octal integer: [0-7]+[oOqQ]
328   // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329   // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
330   if (LexMasmIntegers && isdigit(CurPtr[-1])) {
331     const char *FirstNonBinary =
332         (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
333     const char *FirstNonDecimal =
334         (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
335     const char *OldCurPtr = CurPtr;
336     while (isHexDigit(*CurPtr)) {
337       switch (*CurPtr) {
338       default:
339         if (!FirstNonDecimal) {
340           FirstNonDecimal = CurPtr;
341         }
342         [[fallthrough]];
343       case '9':
344       case '8':
345       case '7':
346       case '6':
347       case '5':
348       case '4':
349       case '3':
350       case '2':
351         if (!FirstNonBinary) {
352           FirstNonBinary = CurPtr;
353         }
354         break;
355       case '1':
356       case '0':
357         break;
358       }
359       ++CurPtr;
360     }
361     if (*CurPtr == '.') {
362       // MASM float literals (other than hex floats) always contain a ".", and
363       // are always written in decimal.
364       ++CurPtr;
365       return LexFloatLiteral();
366     }
367 
368     if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
369       ++CurPtr;
370       return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
371     }
372 
373     unsigned Radix = 0;
374     if (*CurPtr == 'h' || *CurPtr == 'H') {
375       // hexadecimal number
376       ++CurPtr;
377       Radix = 16;
378     } else if (*CurPtr == 't' || *CurPtr == 'T') {
379       // decimal number
380       ++CurPtr;
381       Radix = 10;
382     } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
383                *CurPtr == 'Q') {
384       // octal number
385       ++CurPtr;
386       Radix = 8;
387     } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
388       // binary number
389       ++CurPtr;
390       Radix = 2;
391     } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
392                DefaultRadix < 14 &&
393                (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
394       Radix = 10;
395     } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
396                DefaultRadix < 12 &&
397                (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
398       Radix = 2;
399     }
400 
401     if (Radix) {
402       StringRef Result(TokStart, CurPtr - TokStart);
403       APInt Value(128, 0, true);
404 
405       if (Result.drop_back().getAsInteger(Radix, Value))
406         return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
407 
408       // MSVC accepts and ignores type suffices on integer literals.
409       SkipIgnoredIntegerSuffix(CurPtr);
410 
411       return intToken(Result, Value);
412     }
413 
414     // default-radix integers, or floating point numbers, fall through
415     CurPtr = OldCurPtr;
416   }
417 
418   // MASM default-radix integers: [0-9a-fA-F]+
419   // (All other integer literals have a radix specifier.)
420   if (LexMasmIntegers && UseMasmDefaultRadix) {
421     CurPtr = findLastDigit(CurPtr, 16);
422     StringRef Result(TokStart, CurPtr - TokStart);
423 
424     APInt Value(128, 0, true);
425     if (Result.getAsInteger(DefaultRadix, Value)) {
426       return ReturnError(TokStart,
427                          "invalid " + radixName(DefaultRadix) + " number");
428     }
429 
430     return intToken(Result, Value);
431   }
432 
433   // Motorola hex integers: $[0-9a-fA-F]+
434   if (LexMotorolaIntegers && CurPtr[-1] == '$') {
435     const char *NumStart = CurPtr;
436     while (isHexDigit(CurPtr[0]))
437       ++CurPtr;
438 
439     APInt Result(128, 0);
440     if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
441       return ReturnError(TokStart, "invalid hexadecimal number");
442 
443     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
444   }
445 
446   // Motorola binary integers: %[01]+
447   if (LexMotorolaIntegers && CurPtr[-1] == '%') {
448     const char *NumStart = CurPtr;
449     while (*CurPtr == '0' || *CurPtr == '1')
450       ++CurPtr;
451 
452     APInt Result(128, 0);
453     if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
454       return ReturnError(TokStart, "invalid binary number");
455 
456     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
457   }
458 
459   // Decimal integer: [1-9][0-9]*
460   // HLASM-flavour decimal integer: [0-9][0-9]*
461   // FIXME: Later on, support for fb for HLASM has to be added in
462   // as they probably would be needed for asm goto
463   if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
464     unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
465 
466     if (!LexHLASMIntegers) {
467       bool IsHex = Radix == 16;
468       // Check for floating point literals.
469       if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
470         if (*CurPtr == '.')
471           ++CurPtr;
472         return LexFloatLiteral();
473       }
474     }
475 
476     StringRef Result(TokStart, CurPtr - TokStart);
477 
478     APInt Value(128, 0, true);
479     if (Result.getAsInteger(Radix, Value))
480       return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
481 
482     if (!LexHLASMIntegers)
483       // The darwin/x86 (and x86-64) assembler accepts and ignores type
484       // suffices on integer literals.
485       SkipIgnoredIntegerSuffix(CurPtr);
486 
487     return intToken(Result, Value);
488   }
489 
490   if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
491     ++CurPtr;
492     // See if we actually have "0b" as part of something like "jmp 0b\n"
493     if (!isDigit(CurPtr[0])) {
494       --CurPtr;
495       StringRef Result(TokStart, CurPtr - TokStart);
496       return AsmToken(AsmToken::Integer, Result, 0);
497     }
498     const char *NumStart = CurPtr;
499     while (CurPtr[0] == '0' || CurPtr[0] == '1')
500       ++CurPtr;
501 
502     // Requires at least one binary digit.
503     if (CurPtr == NumStart)
504       return ReturnError(TokStart, "invalid binary number");
505 
506     StringRef Result(TokStart, CurPtr - TokStart);
507 
508     APInt Value(128, 0, true);
509     if (Result.substr(2).getAsInteger(2, Value))
510       return ReturnError(TokStart, "invalid binary number");
511 
512     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513     // suffixes on integer literals.
514     SkipIgnoredIntegerSuffix(CurPtr);
515 
516     return intToken(Result, Value);
517   }
518 
519   if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
520     ++CurPtr;
521     const char *NumStart = CurPtr;
522     while (isHexDigit(CurPtr[0]))
523       ++CurPtr;
524 
525     // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526     // diagnosed by LexHexFloatLiteral).
527     if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
528       return LexHexFloatLiteral(NumStart == CurPtr);
529 
530     // Otherwise requires at least one hex digit.
531     if (CurPtr == NumStart)
532       return ReturnError(CurPtr-2, "invalid hexadecimal number");
533 
534     APInt Result(128, 0);
535     if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
536       return ReturnError(TokStart, "invalid hexadecimal number");
537 
538     // Consume the optional [hH].
539     if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
540       ++CurPtr;
541 
542     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543     // suffixes on integer literals.
544     SkipIgnoredIntegerSuffix(CurPtr);
545 
546     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
547   }
548 
549   // Either octal or hexadecimal.
550   APInt Value(128, 0, true);
551   unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
552   StringRef Result(TokStart, CurPtr - TokStart);
553   if (Result.getAsInteger(Radix, Value))
554     return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
555 
556   // Consume the [hH].
557   if (Radix == 16)
558     ++CurPtr;
559 
560   // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561   // suffixes on integer literals.
562   SkipIgnoredIntegerSuffix(CurPtr);
563 
564   return intToken(Result, Value);
565 }
566 
567 /// LexSingleQuote: Integer: 'b'
LexSingleQuote()568 AsmToken AsmLexer::LexSingleQuote() {
569   int CurChar = getNextChar();
570 
571   if (LexHLASMStrings)
572     return ReturnError(TokStart, "invalid usage of character literals");
573 
574   if (LexMasmStrings) {
575     while (CurChar != EOF) {
576       if (CurChar != '\'') {
577         CurChar = getNextChar();
578       } else if (peekNextChar() == '\'') {
579         // In MASM single-quote strings, doubled single-quotes mean an escaped
580         // single quote, so should be lexed in.
581         (void)getNextChar();
582         CurChar = getNextChar();
583       } else {
584         break;
585       }
586     }
587     if (CurChar == EOF)
588       return ReturnError(TokStart, "unterminated string constant");
589     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
590   }
591 
592   if (CurChar == '\\')
593     CurChar = getNextChar();
594 
595   if (CurChar == EOF)
596     return ReturnError(TokStart, "unterminated single quote");
597 
598   CurChar = getNextChar();
599 
600   if (CurChar != '\'')
601     return ReturnError(TokStart, "single quote way too long");
602 
603   // The idea here being that 'c' is basically just an integral
604   // constant.
605   StringRef Res = StringRef(TokStart,CurPtr - TokStart);
606   long long Value;
607 
608   if (Res.starts_with("\'\\")) {
609     char theChar = Res[2];
610     switch (theChar) {
611       default: Value = theChar; break;
612       case '\'': Value = '\''; break;
613       case 't': Value = '\t'; break;
614       case 'n': Value = '\n'; break;
615       case 'b': Value = '\b'; break;
616       case 'f': Value = '\f'; break;
617       case 'r': Value = '\r'; break;
618     }
619   } else
620     Value = TokStart[1];
621 
622   return AsmToken(AsmToken::Integer, Res, Value);
623 }
624 
625 /// LexQuote: String: "..."
LexQuote()626 AsmToken AsmLexer::LexQuote() {
627   int CurChar = getNextChar();
628   if (LexHLASMStrings)
629     return ReturnError(TokStart, "invalid usage of string literals");
630 
631   if (LexMasmStrings) {
632     while (CurChar != EOF) {
633       if (CurChar != '"') {
634         CurChar = getNextChar();
635       } else if (peekNextChar() == '"') {
636         // In MASM double-quoted strings, doubled double-quotes mean an escaped
637         // double quote, so should be lexed in.
638         (void)getNextChar();
639         CurChar = getNextChar();
640       } else {
641         break;
642       }
643     }
644     if (CurChar == EOF)
645       return ReturnError(TokStart, "unterminated string constant");
646     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
647   }
648 
649   while (CurChar != '"') {
650     if (CurChar == '\\') {
651       // Allow \", etc.
652       CurChar = getNextChar();
653     }
654 
655     if (CurChar == EOF)
656       return ReturnError(TokStart, "unterminated string constant");
657 
658     CurChar = getNextChar();
659   }
660 
661   return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
662 }
663 
LexUntilEndOfStatement()664 StringRef AsmLexer::LexUntilEndOfStatement() {
665   TokStart = CurPtr;
666 
667   while (!isAtStartOfComment(CurPtr) &&     // Start of line comment.
668          !isAtStatementSeparator(CurPtr) && // End of statement marker.
669          *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
670     ++CurPtr;
671   }
672   return StringRef(TokStart, CurPtr-TokStart);
673 }
674 
LexUntilEndOfLine()675 StringRef AsmLexer::LexUntilEndOfLine() {
676   TokStart = CurPtr;
677 
678   while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
679     ++CurPtr;
680   }
681   return StringRef(TokStart, CurPtr-TokStart);
682 }
683 
peekTokens(MutableArrayRef<AsmToken> Buf,bool ShouldSkipSpace)684 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
685                             bool ShouldSkipSpace) {
686   SaveAndRestore SavedTokenStart(TokStart);
687   SaveAndRestore SavedCurPtr(CurPtr);
688   SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
689   SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
690   SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
691   SaveAndRestore SavedIsPeeking(IsPeeking, true);
692   std::string SavedErr = getErr();
693   SMLoc SavedErrLoc = getErrLoc();
694 
695   size_t ReadCount;
696   for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
697     AsmToken Token = LexToken();
698 
699     Buf[ReadCount] = Token;
700 
701     if (Token.is(AsmToken::Eof))
702       break;
703   }
704 
705   SetError(SavedErrLoc, SavedErr);
706   return ReadCount;
707 }
708 
isAtStartOfComment(const char * Ptr)709 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
710   if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
711     return false;
712 
713   StringRef CommentString = MAI.getCommentString();
714 
715   if (CommentString.size() == 1)
716     return CommentString[0] == Ptr[0];
717 
718   // Allow # preprocessor comments also be counted as comments for "##" cases
719   if (CommentString[1] == '#')
720     return CommentString[0] == Ptr[0];
721 
722   return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
723 }
724 
isAtStatementSeparator(const char * Ptr)725 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
726   return strncmp(Ptr, MAI.getSeparatorString(),
727                  strlen(MAI.getSeparatorString())) == 0;
728 }
729 
LexToken()730 AsmToken AsmLexer::LexToken() {
731   TokStart = CurPtr;
732   // This always consumes at least one character.
733   int CurChar = getNextChar();
734 
735   if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
736     // If this starts with a '#', this may be a cpp
737     // hash directive and otherwise a line comment.
738     AsmToken TokenBuf[2];
739     MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
740     size_t num = peekTokens(Buf, true);
741     // There cannot be a space preceding this
742     if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
743         TokenBuf[1].is(AsmToken::String)) {
744       CurPtr = TokStart; // reset curPtr;
745       StringRef s = LexUntilEndOfLine();
746       UnLex(TokenBuf[1]);
747       UnLex(TokenBuf[0]);
748       return AsmToken(AsmToken::HashDirective, s);
749     }
750 
751     if (MAI.shouldAllowAdditionalComments())
752       return LexLineComment();
753   }
754 
755   if (isAtStartOfComment(TokStart))
756     return LexLineComment();
757 
758   if (isAtStatementSeparator(TokStart)) {
759     CurPtr += strlen(MAI.getSeparatorString()) - 1;
760     IsAtStartOfLine = true;
761     IsAtStartOfStatement = true;
762     return AsmToken(AsmToken::EndOfStatement,
763                     StringRef(TokStart, strlen(MAI.getSeparatorString())));
764   }
765 
766   // If we're missing a newline at EOF, make sure we still get an
767   // EndOfStatement token before the Eof token.
768   if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
769     IsAtStartOfLine = true;
770     IsAtStartOfStatement = true;
771     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
772   }
773   IsAtStartOfLine = false;
774   bool OldIsAtStartOfStatement = IsAtStartOfStatement;
775   IsAtStartOfStatement = false;
776   switch (CurChar) {
777   default:
778     // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
779     // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
780     // an identifier is target-dependent. These characters are handled in the
781     // respective switch cases.
782     if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
783       return LexIdentifier();
784 
785     // Unknown character, emit an error.
786     return ReturnError(TokStart, "invalid character in input");
787   case EOF:
788     if (EndStatementAtEOF) {
789       IsAtStartOfLine = true;
790       IsAtStartOfStatement = true;
791     }
792     return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
793   case 0:
794   case ' ':
795   case '\t':
796     IsAtStartOfStatement = OldIsAtStartOfStatement;
797     while (*CurPtr == ' ' || *CurPtr == '\t')
798       CurPtr++;
799     if (SkipSpace)
800       return LexToken(); // Ignore whitespace.
801     else
802       return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
803   case '\r': {
804     IsAtStartOfLine = true;
805     IsAtStartOfStatement = true;
806     // If this is a CR followed by LF, treat that as one token.
807     if (CurPtr != CurBuf.end() && *CurPtr == '\n')
808       ++CurPtr;
809     return AsmToken(AsmToken::EndOfStatement,
810                     StringRef(TokStart, CurPtr - TokStart));
811   }
812   case '\n':
813     IsAtStartOfLine = true;
814     IsAtStartOfStatement = true;
815     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
816   case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
817   case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
818   case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
819   case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
820   case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
821   case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
822   case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
823   case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
824   case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
825   case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
826   case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
827   case '$': {
828     if (LexMotorolaIntegers && isHexDigit(*CurPtr))
829       return LexDigit();
830     if (MAI.doesAllowDollarAtStartOfIdentifier())
831       return LexIdentifier();
832     return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
833   }
834   case '@':
835     if (MAI.doesAllowAtAtStartOfIdentifier())
836       return LexIdentifier();
837     return AsmToken(AsmToken::At, StringRef(TokStart, 1));
838   case '#':
839     if (MAI.doesAllowHashAtStartOfIdentifier())
840       return LexIdentifier();
841     return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
842   case '?':
843     if (MAI.doesAllowQuestionAtStartOfIdentifier())
844       return LexIdentifier();
845     return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
846   case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
847   case '=':
848     if (*CurPtr == '=') {
849       ++CurPtr;
850       return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
851     }
852     return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
853   case '-':
854     if (*CurPtr == '>') {
855       ++CurPtr;
856       return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
857     }
858     return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
859   case '|':
860     if (*CurPtr == '|') {
861       ++CurPtr;
862       return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
863     }
864     return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
865   case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
866   case '&':
867     if (*CurPtr == '&') {
868       ++CurPtr;
869       return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
870     }
871     return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
872   case '!':
873     if (*CurPtr == '=') {
874       ++CurPtr;
875       return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
876     }
877     return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
878   case '%':
879     if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
880       return LexDigit();
881     }
882 
883     if (MAI.hasMipsExpressions()) {
884       AsmToken::TokenKind Operator;
885       unsigned OperatorLength;
886 
887       std::tie(Operator, OperatorLength) =
888           StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
889               StringRef(CurPtr))
890               .StartsWith("call16", {AsmToken::PercentCall16, 7})
891               .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
892               .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
893               .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
894               .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
895               .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
896               .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
897               .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
898               .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
899               .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
900               .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
901               .StartsWith("got", {AsmToken::PercentGot, 4})
902               .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
903               .StartsWith("higher", {AsmToken::PercentHigher, 7})
904               .StartsWith("highest", {AsmToken::PercentHighest, 8})
905               .StartsWith("hi", {AsmToken::PercentHi, 3})
906               .StartsWith("lo", {AsmToken::PercentLo, 3})
907               .StartsWith("neg", {AsmToken::PercentNeg, 4})
908               .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
909               .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
910               .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
911               .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
912               .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
913               .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
914               .Default({AsmToken::Percent, 1});
915 
916       if (Operator != AsmToken::Percent) {
917         CurPtr += OperatorLength - 1;
918         return AsmToken(Operator, StringRef(TokStart, OperatorLength));
919       }
920     }
921     return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
922   case '/':
923     IsAtStartOfStatement = OldIsAtStartOfStatement;
924     return LexSlash();
925   case '\'': return LexSingleQuote();
926   case '"': return LexQuote();
927   case '0': case '1': case '2': case '3': case '4':
928   case '5': case '6': case '7': case '8': case '9':
929     return LexDigit();
930   case '<':
931     switch (*CurPtr) {
932     case '<':
933       ++CurPtr;
934       return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
935     case '=':
936       ++CurPtr;
937       return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
938     case '>':
939       ++CurPtr;
940       return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
941     default:
942       return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
943     }
944   case '>':
945     switch (*CurPtr) {
946     case '>':
947       ++CurPtr;
948       return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
949     case '=':
950       ++CurPtr;
951       return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
952     default:
953       return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
954     }
955 
956   // TODO: Quoted identifiers (objc methods etc)
957   // local labels: [0-9][:]
958   // Forward/backward labels: [0-9][fb]
959   // Integers, fp constants, character constants.
960   }
961 }
962