xref: /freebsd/contrib/llvm-project/llvm/lib/MC/MCParser/AsmLexer.cpp (revision f126d349810fdb512c0b01e101342d430b947488)
1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class implements the lexer for assembly files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/StringSwitch.h"
19 #include "llvm/MC/MCAsmInfo.h"
20 #include "llvm/MC/MCParser/MCAsmLexer.h"
21 #include "llvm/Support/Compiler.h"
22 #include "llvm/Support/SMLoc.h"
23 #include "llvm/Support/SaveAndRestore.h"
24 #include <cassert>
25 #include <cctype>
26 #include <cstdio>
27 #include <cstring>
28 #include <string>
29 #include <tuple>
30 #include <utility>
31 
32 using namespace llvm;
33 
34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35   AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
36   LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
37 }
38 
39 AsmLexer::~AsmLexer() = default;
40 
41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42                          bool EndStatementAtEOF) {
43   CurBuf = Buf;
44 
45   if (ptr)
46     CurPtr = ptr;
47   else
48     CurPtr = CurBuf.begin();
49 
50   TokStart = nullptr;
51   this->EndStatementAtEOF = EndStatementAtEOF;
52 }
53 
54 /// ReturnError - Set the error to the specified string at the specified
55 /// location.  This is defined to always return AsmToken::Error.
56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57   SetError(SMLoc::getFromPointer(Loc), Msg);
58 
59   return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60 }
61 
62 int AsmLexer::getNextChar() {
63   if (CurPtr == CurBuf.end())
64     return EOF;
65   return (unsigned char)*CurPtr++;
66 }
67 
68 int AsmLexer::peekNextChar() {
69   if (CurPtr == CurBuf.end())
70     return EOF;
71   return (unsigned char)*CurPtr;
72 }
73 
74 /// The leading integral digit sequence and dot should have already been
75 /// consumed, some or all of the fractional digit sequence *can* have been
76 /// consumed.
77 AsmToken AsmLexer::LexFloatLiteral() {
78   // Skip the fractional digit sequence.
79   while (isDigit(*CurPtr))
80     ++CurPtr;
81 
82   if (*CurPtr == '-' || *CurPtr == '+')
83     return ReturnError(CurPtr, "invalid sign in float literal");
84 
85   // Check for exponent
86   if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87     ++CurPtr;
88 
89     if (*CurPtr == '-' || *CurPtr == '+')
90       ++CurPtr;
91 
92     while (isDigit(*CurPtr))
93       ++CurPtr;
94   }
95 
96   return AsmToken(AsmToken::Real,
97                   StringRef(TokStart, CurPtr - TokStart));
98 }
99 
100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101 /// while making sure there are enough actual digits around for the constant to
102 /// be valid.
103 ///
104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105 /// before we get here.
106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107   assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108          "unexpected parse state in floating hex");
109   bool NoFracDigits = true;
110 
111   // Skip the fractional part if there is one
112   if (*CurPtr == '.') {
113     ++CurPtr;
114 
115     const char *FracStart = CurPtr;
116     while (isHexDigit(*CurPtr))
117       ++CurPtr;
118 
119     NoFracDigits = CurPtr == FracStart;
120   }
121 
122   if (NoIntDigits && NoFracDigits)
123     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124                                  "expected at least one significand digit");
125 
126   // Make sure we do have some kind of proper exponent part
127   if (*CurPtr != 'p' && *CurPtr != 'P')
128     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
129                                  "expected exponent part 'p'");
130   ++CurPtr;
131 
132   if (*CurPtr == '+' || *CurPtr == '-')
133     ++CurPtr;
134 
135   // N.b. exponent digits are *not* hex
136   const char *ExpStart = CurPtr;
137   while (isDigit(*CurPtr))
138     ++CurPtr;
139 
140   if (CurPtr == ExpStart)
141     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
142                                  "expected at least one exponent digit");
143 
144   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
145 }
146 
147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149   return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150          (AllowAt && C == '@') || (AllowHash && C == '#');
151 }
152 
153 AsmToken AsmLexer::LexIdentifier() {
154   // Check for floating point literals.
155   if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
156     // Disambiguate a .1243foo identifier from a floating literal.
157     while (isDigit(*CurPtr))
158       ++CurPtr;
159 
160     if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
161                           AllowHashInIdentifier) ||
162         *CurPtr == 'e' || *CurPtr == 'E')
163       return LexFloatLiteral();
164   }
165 
166   while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
167     ++CurPtr;
168 
169   // Handle . as a special case.
170   if (CurPtr == TokStart+1 && TokStart[0] == '.')
171     return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
172 
173   return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
174 }
175 
176 /// LexSlash: Slash: /
177 ///           C-Style Comment: /* ... */
178 ///           C-style Comment: // ...
179 AsmToken AsmLexer::LexSlash() {
180   if (!MAI.shouldAllowAdditionalComments()) {
181     IsAtStartOfStatement = false;
182     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
183   }
184 
185   switch (*CurPtr) {
186   case '*':
187     IsAtStartOfStatement = false;
188     break; // C style comment.
189   case '/':
190     ++CurPtr;
191     return LexLineComment();
192   default:
193     IsAtStartOfStatement = false;
194     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
195   }
196 
197   // C Style comment.
198   ++CurPtr;  // skip the star.
199   const char *CommentTextStart = CurPtr;
200   while (CurPtr != CurBuf.end()) {
201     switch (*CurPtr++) {
202     case '*':
203       // End of the comment?
204       if (*CurPtr != '/')
205         break;
206       // If we have a CommentConsumer, notify it about the comment.
207       if (CommentConsumer) {
208         CommentConsumer->HandleComment(
209             SMLoc::getFromPointer(CommentTextStart),
210             StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211       }
212       ++CurPtr;   // End the */.
213       return AsmToken(AsmToken::Comment,
214                       StringRef(TokStart, CurPtr - TokStart));
215     }
216   }
217   return ReturnError(TokStart, "unterminated comment");
218 }
219 
220 /// LexLineComment: Comment: #[^\n]*
221 ///                        : //[^\n]*
222 AsmToken AsmLexer::LexLineComment() {
223   // Mark This as an end of statement with a body of the
224   // comment. While it would be nicer to leave this two tokens,
225   // backwards compatability with TargetParsers makes keeping this in this form
226   // better.
227   const char *CommentTextStart = CurPtr;
228   int CurChar = getNextChar();
229   while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230     CurChar = getNextChar();
231   const char *NewlinePtr = CurPtr;
232   if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
233     ++CurPtr;
234 
235   // If we have a CommentConsumer, notify it about the comment.
236   if (CommentConsumer) {
237     CommentConsumer->HandleComment(
238         SMLoc::getFromPointer(CommentTextStart),
239         StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
240   }
241 
242   IsAtStartOfLine = true;
243   // This is a whole line comment. leave newline
244   if (IsAtStartOfStatement)
245     return AsmToken(AsmToken::EndOfStatement,
246                     StringRef(TokStart, CurPtr - TokStart));
247   IsAtStartOfStatement = true;
248 
249   return AsmToken(AsmToken::EndOfStatement,
250                   StringRef(TokStart, CurPtr - 1 - TokStart));
251 }
252 
/// Advance \p CurPtr past an integer type suffix: an optional 'U' followed by
/// at most two 'L's (so U, L, UL, LL and ULL are all skipped).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const char *Cursor = CurPtr;

  if (*Cursor == 'U')
    ++Cursor;

  for (int NumLs = 0; NumLs < 2 && *Cursor == 'L'; ++NumLs)
    ++Cursor;

  CurPtr = Cursor;
}
262 
263 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264 // integer as a hexadecimal, possibly with leading zeroes.
265 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
266                                bool LexHex) {
267   const char *FirstNonDec = nullptr;
268   const char *LookAhead = CurPtr;
269   while (true) {
270     if (isDigit(*LookAhead)) {
271       ++LookAhead;
272     } else {
273       if (!FirstNonDec)
274         FirstNonDec = LookAhead;
275 
276       // Keep going if we are looking for a 'h' suffix.
277       if (LexHex && isHexDigit(*LookAhead))
278         ++LookAhead;
279       else
280         break;
281     }
282   }
283   bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
284   CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
285   if (isHex)
286     return 16;
287   return DefaultRadix;
288 }
289 
290 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
291   while (hexDigitValue(*CurPtr) < DefaultRadix) {
292     ++CurPtr;
293   }
294   return CurPtr;
295 }
296 
297 static AsmToken intToken(StringRef Ref, APInt &Value) {
298   if (Value.isIntN(64))
299     return AsmToken(AsmToken::Integer, Ref, Value);
300   return AsmToken(AsmToken::BigNum, Ref, Value);
301 }
302 
/// Return a human-readable name for \p Radix for use in diagnostics; radices
/// without a common name are rendered as "base-N".
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
317 
318 /// LexDigit: First character is [0-9].
319 ///   Local Label: [0-9][:]
320 ///   Forward/Backward Label: [0-9][fb]
321 ///   Binary integer: 0b[01]+
322 ///   Octal integer: 0[0-7]+
323 ///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324 ///   Decimal integer: [1-9][0-9]*
325 AsmToken AsmLexer::LexDigit() {
326   // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327   // MASM-flavor octal integer: [0-7]+[oOqQ]
328   // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329   // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
330   if (LexMasmIntegers && isdigit(CurPtr[-1])) {
331     const char *FirstNonBinary =
332         (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
333     const char *FirstNonDecimal =
334         (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
335     const char *OldCurPtr = CurPtr;
336     while (isHexDigit(*CurPtr)) {
337       switch (*CurPtr) {
338       default:
339         if (!FirstNonDecimal) {
340           FirstNonDecimal = CurPtr;
341         }
342         LLVM_FALLTHROUGH;
343       case '9':
344       case '8':
345       case '7':
346       case '6':
347       case '5':
348       case '4':
349       case '3':
350       case '2':
351         if (!FirstNonBinary) {
352           FirstNonBinary = CurPtr;
353         }
354         break;
355       case '1':
356       case '0':
357         break;
358       }
359       ++CurPtr;
360     }
361     if (*CurPtr == '.') {
362       // MASM float literals (other than hex floats) always contain a ".", and
363       // are always written in decimal.
364       ++CurPtr;
365       return LexFloatLiteral();
366     }
367 
368     if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
369       ++CurPtr;
370       return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
371     }
372 
373     unsigned Radix = 0;
374     if (*CurPtr == 'h' || *CurPtr == 'H') {
375       // hexadecimal number
376       ++CurPtr;
377       Radix = 16;
378     } else if (*CurPtr == 't' || *CurPtr == 'T') {
379       // decimal number
380       ++CurPtr;
381       Radix = 10;
382     } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
383                *CurPtr == 'Q') {
384       // octal number
385       ++CurPtr;
386       Radix = 8;
387     } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
388       // binary number
389       ++CurPtr;
390       Radix = 2;
391     } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
392                DefaultRadix < 14 &&
393                (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
394       Radix = 10;
395     } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
396                DefaultRadix < 12 &&
397                (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
398       Radix = 2;
399     }
400 
401     if (Radix) {
402       StringRef Result(TokStart, CurPtr - TokStart);
403       APInt Value(128, 0, true);
404 
405       if (Result.drop_back().getAsInteger(Radix, Value))
406         return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
407 
408       // MSVC accepts and ignores type suffices on integer literals.
409       SkipIgnoredIntegerSuffix(CurPtr);
410 
411       return intToken(Result, Value);
412     }
413 
414     // default-radix integers, or floating point numbers, fall through
415     CurPtr = OldCurPtr;
416   }
417 
418   // MASM default-radix integers: [0-9a-fA-F]+
419   // (All other integer literals have a radix specifier.)
420   if (LexMasmIntegers && UseMasmDefaultRadix) {
421     CurPtr = findLastDigit(CurPtr, 16);
422     StringRef Result(TokStart, CurPtr - TokStart);
423 
424     APInt Value(128, 0, true);
425     if (Result.getAsInteger(DefaultRadix, Value)) {
426       return ReturnError(TokStart,
427                          "invalid " + radixName(DefaultRadix) + " number");
428     }
429 
430     return intToken(Result, Value);
431   }
432 
433   // Motorola hex integers: $[0-9a-fA-F]+
434   if (LexMotorolaIntegers && CurPtr[-1] == '$') {
435     const char *NumStart = CurPtr;
436     while (isHexDigit(CurPtr[0]))
437       ++CurPtr;
438 
439     APInt Result(128, 0);
440     if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
441       return ReturnError(TokStart, "invalid hexadecimal number");
442 
443     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
444   }
445 
446   // Motorola binary integers: %[01]+
447   if (LexMotorolaIntegers && CurPtr[-1] == '%') {
448     const char *NumStart = CurPtr;
449     while (*CurPtr == '0' || *CurPtr == '1')
450       ++CurPtr;
451 
452     APInt Result(128, 0);
453     if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
454       return ReturnError(TokStart, "invalid binary number");
455 
456     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
457   }
458 
459   // Decimal integer: [1-9][0-9]*
460   // HLASM-flavour decimal integer: [0-9][0-9]*
461   // FIXME: Later on, support for fb for HLASM has to be added in
462   // as they probably would be needed for asm goto
463   if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
464     unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
465 
466     if (!LexHLASMIntegers) {
467       bool IsHex = Radix == 16;
468       // Check for floating point literals.
469       if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
470         if (*CurPtr == '.')
471           ++CurPtr;
472         return LexFloatLiteral();
473       }
474     }
475 
476     StringRef Result(TokStart, CurPtr - TokStart);
477 
478     APInt Value(128, 0, true);
479     if (Result.getAsInteger(Radix, Value))
480       return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
481 
482     if (!LexHLASMIntegers)
483       // The darwin/x86 (and x86-64) assembler accepts and ignores type
484       // suffices on integer literals.
485       SkipIgnoredIntegerSuffix(CurPtr);
486 
487     return intToken(Result, Value);
488   }
489 
490   if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
491     ++CurPtr;
492     // See if we actually have "0b" as part of something like "jmp 0b\n"
493     if (!isDigit(CurPtr[0])) {
494       --CurPtr;
495       StringRef Result(TokStart, CurPtr - TokStart);
496       return AsmToken(AsmToken::Integer, Result, 0);
497     }
498     const char *NumStart = CurPtr;
499     while (CurPtr[0] == '0' || CurPtr[0] == '1')
500       ++CurPtr;
501 
502     // Requires at least one binary digit.
503     if (CurPtr == NumStart)
504       return ReturnError(TokStart, "invalid binary number");
505 
506     StringRef Result(TokStart, CurPtr - TokStart);
507 
508     APInt Value(128, 0, true);
509     if (Result.substr(2).getAsInteger(2, Value))
510       return ReturnError(TokStart, "invalid binary number");
511 
512     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513     // suffixes on integer literals.
514     SkipIgnoredIntegerSuffix(CurPtr);
515 
516     return intToken(Result, Value);
517   }
518 
519   if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
520     ++CurPtr;
521     const char *NumStart = CurPtr;
522     while (isHexDigit(CurPtr[0]))
523       ++CurPtr;
524 
525     // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526     // diagnosed by LexHexFloatLiteral).
527     if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
528       return LexHexFloatLiteral(NumStart == CurPtr);
529 
530     // Otherwise requires at least one hex digit.
531     if (CurPtr == NumStart)
532       return ReturnError(CurPtr-2, "invalid hexadecimal number");
533 
534     APInt Result(128, 0);
535     if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
536       return ReturnError(TokStart, "invalid hexadecimal number");
537 
538     // Consume the optional [hH].
539     if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
540       ++CurPtr;
541 
542     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543     // suffixes on integer literals.
544     SkipIgnoredIntegerSuffix(CurPtr);
545 
546     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
547   }
548 
549   // Either octal or hexadecimal.
550   APInt Value(128, 0, true);
551   unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
552   StringRef Result(TokStart, CurPtr - TokStart);
553   if (Result.getAsInteger(Radix, Value))
554     return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
555 
556   // Consume the [hH].
557   if (Radix == 16)
558     ++CurPtr;
559 
560   // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561   // suffixes on integer literals.
562   SkipIgnoredIntegerSuffix(CurPtr);
563 
564   return intToken(Result, Value);
565 }
566 
567 /// LexSingleQuote: Integer: 'b'
568 AsmToken AsmLexer::LexSingleQuote() {
569   int CurChar = getNextChar();
570 
571   if (LexHLASMStrings)
572     return ReturnError(TokStart, "invalid usage of character literals");
573 
574   if (LexMasmStrings) {
575     while (CurChar != EOF) {
576       if (CurChar != '\'') {
577         CurChar = getNextChar();
578       } else if (peekNextChar() == '\'') {
579         // In MASM single-quote strings, doubled single-quotes mean an escaped
580         // single quote, so should be lexed in.
581         getNextChar();
582         CurChar = getNextChar();
583       } else {
584         break;
585       }
586     }
587     if (CurChar == EOF)
588       return ReturnError(TokStart, "unterminated string constant");
589     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
590   }
591 
592   if (CurChar == '\\')
593     CurChar = getNextChar();
594 
595   if (CurChar == EOF)
596     return ReturnError(TokStart, "unterminated single quote");
597 
598   CurChar = getNextChar();
599 
600   if (CurChar != '\'')
601     return ReturnError(TokStart, "single quote way too long");
602 
603   // The idea here being that 'c' is basically just an integral
604   // constant.
605   StringRef Res = StringRef(TokStart,CurPtr - TokStart);
606   long long Value;
607 
608   if (Res.startswith("\'\\")) {
609     char theChar = Res[2];
610     switch (theChar) {
611       default: Value = theChar; break;
612       case '\'': Value = '\''; break;
613       case 't': Value = '\t'; break;
614       case 'n': Value = '\n'; break;
615       case 'b': Value = '\b'; break;
616       case 'f': Value = '\f'; break;
617       case 'r': Value = '\r'; break;
618     }
619   } else
620     Value = TokStart[1];
621 
622   return AsmToken(AsmToken::Integer, Res, Value);
623 }
624 
625 /// LexQuote: String: "..."
626 AsmToken AsmLexer::LexQuote() {
627   int CurChar = getNextChar();
628   if (LexHLASMStrings)
629     return ReturnError(TokStart, "invalid usage of string literals");
630 
631   if (LexMasmStrings) {
632     while (CurChar != EOF) {
633       if (CurChar != '"') {
634         CurChar = getNextChar();
635       } else if (peekNextChar() == '"') {
636         // In MASM double-quoted strings, doubled double-quotes mean an escaped
637         // double quote, so should be lexed in.
638         getNextChar();
639         CurChar = getNextChar();
640       } else {
641         break;
642       }
643     }
644     if (CurChar == EOF)
645       return ReturnError(TokStart, "unterminated string constant");
646     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
647   }
648 
649   // TODO: does gas allow multiline string constants?
650   while (CurChar != '"') {
651     if (CurChar == '\\') {
652       // Allow \", etc.
653       CurChar = getNextChar();
654     }
655 
656     if (CurChar == EOF)
657       return ReturnError(TokStart, "unterminated string constant");
658 
659     CurChar = getNextChar();
660   }
661 
662   return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
663 }
664 
665 StringRef AsmLexer::LexUntilEndOfStatement() {
666   TokStart = CurPtr;
667 
668   while (!isAtStartOfComment(CurPtr) &&     // Start of line comment.
669          !isAtStatementSeparator(CurPtr) && // End of statement marker.
670          *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
671     ++CurPtr;
672   }
673   return StringRef(TokStart, CurPtr-TokStart);
674 }
675 
676 StringRef AsmLexer::LexUntilEndOfLine() {
677   TokStart = CurPtr;
678 
679   while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
680     ++CurPtr;
681   }
682   return StringRef(TokStart, CurPtr-TokStart);
683 }
684 
685 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
686                             bool ShouldSkipSpace) {
687   SaveAndRestore<const char *> SavedTokenStart(TokStart);
688   SaveAndRestore<const char *> SavedCurPtr(CurPtr);
689   SaveAndRestore<bool> SavedAtStartOfLine(IsAtStartOfLine);
690   SaveAndRestore<bool> SavedAtStartOfStatement(IsAtStartOfStatement);
691   SaveAndRestore<bool> SavedSkipSpace(SkipSpace, ShouldSkipSpace);
692   SaveAndRestore<bool> SavedIsPeeking(IsPeeking, true);
693   std::string SavedErr = getErr();
694   SMLoc SavedErrLoc = getErrLoc();
695 
696   size_t ReadCount;
697   for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
698     AsmToken Token = LexToken();
699 
700     Buf[ReadCount] = Token;
701 
702     if (Token.is(AsmToken::Eof))
703       break;
704   }
705 
706   SetError(SavedErrLoc, SavedErr);
707   return ReadCount;
708 }
709 
710 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
711   if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
712     return false;
713 
714   StringRef CommentString = MAI.getCommentString();
715 
716   if (CommentString.size() == 1)
717     return CommentString[0] == Ptr[0];
718 
719   // Allow # preprocessor commments also be counted as comments for "##" cases
720   if (CommentString[1] == '#')
721     return CommentString[0] == Ptr[0];
722 
723   return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
724 }
725 
726 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
727   return strncmp(Ptr, MAI.getSeparatorString(),
728                  strlen(MAI.getSeparatorString())) == 0;
729 }
730 
731 AsmToken AsmLexer::LexToken() {
732   TokStart = CurPtr;
733   // This always consumes at least one character.
734   int CurChar = getNextChar();
735 
736   if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
737     // If this starts with a '#', this may be a cpp
738     // hash directive and otherwise a line comment.
739     AsmToken TokenBuf[2];
740     MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
741     size_t num = peekTokens(Buf, true);
742     // There cannot be a space preceding this
743     if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
744         TokenBuf[1].is(AsmToken::String)) {
745       CurPtr = TokStart; // reset curPtr;
746       StringRef s = LexUntilEndOfLine();
747       UnLex(TokenBuf[1]);
748       UnLex(TokenBuf[0]);
749       return AsmToken(AsmToken::HashDirective, s);
750     }
751 
752     if (MAI.shouldAllowAdditionalComments())
753       return LexLineComment();
754   }
755 
756   if (isAtStartOfComment(TokStart))
757     return LexLineComment();
758 
759   if (isAtStatementSeparator(TokStart)) {
760     CurPtr += strlen(MAI.getSeparatorString()) - 1;
761     IsAtStartOfLine = true;
762     IsAtStartOfStatement = true;
763     return AsmToken(AsmToken::EndOfStatement,
764                     StringRef(TokStart, strlen(MAI.getSeparatorString())));
765   }
766 
767   // If we're missing a newline at EOF, make sure we still get an
768   // EndOfStatement token before the Eof token.
769   if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
770     IsAtStartOfLine = true;
771     IsAtStartOfStatement = true;
772     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
773   }
774   IsAtStartOfLine = false;
775   bool OldIsAtStartOfStatement = IsAtStartOfStatement;
776   IsAtStartOfStatement = false;
777   switch (CurChar) {
778   default:
779     // Handle identifier: [a-zA-Z_.?][a-zA-Z0-9_$.@#?]*
780     if (isalpha(CurChar) || CurChar == '_' || CurChar == '.' ||
781         (MAI.doesAllowQuestionAtStartOfIdentifier() && CurChar == '?'))
782       return LexIdentifier();
783 
784     // Unknown character, emit an error.
785     return ReturnError(TokStart, "invalid character in input");
786   case EOF:
787     if (EndStatementAtEOF) {
788       IsAtStartOfLine = true;
789       IsAtStartOfStatement = true;
790     }
791     return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
792   case 0:
793   case ' ':
794   case '\t':
795     IsAtStartOfStatement = OldIsAtStartOfStatement;
796     while (*CurPtr == ' ' || *CurPtr == '\t')
797       CurPtr++;
798     if (SkipSpace)
799       return LexToken(); // Ignore whitespace.
800     else
801       return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
802   case '\r': {
803     IsAtStartOfLine = true;
804     IsAtStartOfStatement = true;
805     // If this is a CR followed by LF, treat that as one token.
806     if (CurPtr != CurBuf.end() && *CurPtr == '\n')
807       ++CurPtr;
808     return AsmToken(AsmToken::EndOfStatement,
809                     StringRef(TokStart, CurPtr - TokStart));
810   }
811   case '\n':
812     IsAtStartOfLine = true;
813     IsAtStartOfStatement = true;
814     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
815   case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
816   case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
817   case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
818   case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
819   case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
820   case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
821   case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
822   case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
823   case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
824   case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
825   case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
826   case '$': {
827     if (LexMotorolaIntegers && isHexDigit(*CurPtr))
828       return LexDigit();
829     if (MAI.doesAllowDollarAtStartOfIdentifier())
830       return LexIdentifier();
831     return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
832   }
833   case '@': {
834     if (MAI.doesAllowAtAtStartOfIdentifier())
835       return LexIdentifier();
836     return AsmToken(AsmToken::At, StringRef(TokStart, 1));
837   }
838   case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
839   case '=':
840     if (*CurPtr == '=') {
841       ++CurPtr;
842       return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
843     }
844     return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
845   case '-':
846     if (*CurPtr == '>') {
847       ++CurPtr;
848       return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
849     }
850     return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
851   case '|':
852     if (*CurPtr == '|') {
853       ++CurPtr;
854       return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
855     }
856     return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
857   case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
858   case '&':
859     if (*CurPtr == '&') {
860       ++CurPtr;
861       return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
862     }
863     return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
864   case '!':
865     if (*CurPtr == '=') {
866       ++CurPtr;
867       return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
868     }
869     return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
870   case '%':
871     if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
872       return LexDigit();
873     }
874 
875     if (MAI.hasMipsExpressions()) {
876       AsmToken::TokenKind Operator;
877       unsigned OperatorLength;
878 
879       std::tie(Operator, OperatorLength) =
880           StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
881               StringRef(CurPtr))
882               .StartsWith("call16", {AsmToken::PercentCall16, 7})
883               .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
884               .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
885               .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
886               .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
887               .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
888               .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
889               .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
890               .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
891               .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
892               .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
893               .StartsWith("got", {AsmToken::PercentGot, 4})
894               .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
895               .StartsWith("higher", {AsmToken::PercentHigher, 7})
896               .StartsWith("highest", {AsmToken::PercentHighest, 8})
897               .StartsWith("hi", {AsmToken::PercentHi, 3})
898               .StartsWith("lo", {AsmToken::PercentLo, 3})
899               .StartsWith("neg", {AsmToken::PercentNeg, 4})
900               .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
901               .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
902               .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
903               .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
904               .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
905               .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
906               .Default({AsmToken::Percent, 1});
907 
908       if (Operator != AsmToken::Percent) {
909         CurPtr += OperatorLength - 1;
910         return AsmToken(Operator, StringRef(TokStart, OperatorLength));
911       }
912     }
913     return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
914   case '/':
915     IsAtStartOfStatement = OldIsAtStartOfStatement;
916     return LexSlash();
917   case '#': {
918     if (MAI.doesAllowHashAtStartOfIdentifier())
919       return LexIdentifier();
920     return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
921   }
922   case '\'': return LexSingleQuote();
923   case '"': return LexQuote();
924   case '0': case '1': case '2': case '3': case '4':
925   case '5': case '6': case '7': case '8': case '9':
926     return LexDigit();
927   case '<':
928     switch (*CurPtr) {
929     case '<':
930       ++CurPtr;
931       return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
932     case '=':
933       ++CurPtr;
934       return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
935     case '>':
936       ++CurPtr;
937       return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
938     default:
939       return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
940     }
941   case '>':
942     switch (*CurPtr) {
943     case '>':
944       ++CurPtr;
945       return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
946     case '=':
947       ++CurPtr;
948       return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
949     default:
950       return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
951     }
952 
953   // TODO: Quoted identifiers (objc methods etc)
954   // local labels: [0-9][:]
955   // Forward/backward labels: [0-9][fb]
956   // Integers, fp constants, character constants.
957   }
958 }
959