xref: /freebsd/contrib/llvm-project/llvm/lib/MC/MCParser/AsmLexer.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class implements the lexer for assembly files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/MC/MCAsmInfo.h"
19 #include "llvm/MC/MCParser/AsmLexer.h"
20 #include "llvm/Support/Compiler.h"
21 #include "llvm/Support/SMLoc.h"
22 #include "llvm/Support/SaveAndRestore.h"
23 #include "llvm/Support/raw_ostream.h"
24 #include <cassert>
25 #include <cctype>
26 #include <cstdio>
27 #include <cstring>
28 #include <string>
29 
30 using namespace llvm;
31 
getLoc() const32 SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Str.data()); }
33 
getEndLoc() const34 SMLoc AsmToken::getEndLoc() const {
35   return SMLoc::getFromPointer(Str.data() + Str.size());
36 }
37 
getLocRange() const38 SMRange AsmToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); }
39 
dump(raw_ostream & OS) const40 void AsmToken::dump(raw_ostream &OS) const {
41   switch (Kind) {
42   case AsmToken::Error:
43     OS << "error";
44     break;
45   case AsmToken::Identifier:
46     OS << "identifier: " << getString();
47     break;
48   case AsmToken::Integer:
49     OS << "int: " << getString();
50     break;
51   case AsmToken::Real:
52     OS << "real: " << getString();
53     break;
54   case AsmToken::String:
55     OS << "string: " << getString();
56     break;
57 
58     // clang-format off
59   case AsmToken::Amp:                OS << "Amp"; break;
60   case AsmToken::AmpAmp:             OS << "AmpAmp"; break;
61   case AsmToken::At:                 OS << "At"; break;
62   case AsmToken::BackSlash:          OS << "BackSlash"; break;
63   case AsmToken::BigNum:             OS << "BigNum"; break;
64   case AsmToken::Caret:              OS << "Caret"; break;
65   case AsmToken::Colon:              OS << "Colon"; break;
66   case AsmToken::Comma:              OS << "Comma"; break;
67   case AsmToken::Comment:            OS << "Comment"; break;
68   case AsmToken::Dollar:             OS << "Dollar"; break;
69   case AsmToken::Dot:                OS << "Dot"; break;
70   case AsmToken::EndOfStatement:     OS << "EndOfStatement"; break;
71   case AsmToken::Eof:                OS << "Eof"; break;
72   case AsmToken::Equal:              OS << "Equal"; break;
73   case AsmToken::EqualEqual:         OS << "EqualEqual"; break;
74   case AsmToken::Exclaim:            OS << "Exclaim"; break;
75   case AsmToken::ExclaimEqual:       OS << "ExclaimEqual"; break;
76   case AsmToken::Greater:            OS << "Greater"; break;
77   case AsmToken::GreaterEqual:       OS << "GreaterEqual"; break;
78   case AsmToken::GreaterGreater:     OS << "GreaterGreater"; break;
79   case AsmToken::Hash:               OS << "Hash"; break;
80   case AsmToken::HashDirective:      OS << "HashDirective"; break;
81   case AsmToken::LBrac:              OS << "LBrac"; break;
82   case AsmToken::LCurly:             OS << "LCurly"; break;
83   case AsmToken::LParen:             OS << "LParen"; break;
84   case AsmToken::Less:               OS << "Less"; break;
85   case AsmToken::LessEqual:          OS << "LessEqual"; break;
86   case AsmToken::LessGreater:        OS << "LessGreater"; break;
87   case AsmToken::LessLess:           OS << "LessLess"; break;
88   case AsmToken::Minus:              OS << "Minus"; break;
89   case AsmToken::MinusGreater:       OS << "MinusGreater"; break;
90   case AsmToken::Percent:            OS << "Percent"; break;
91   case AsmToken::Pipe:               OS << "Pipe"; break;
92   case AsmToken::PipePipe:           OS << "PipePipe"; break;
93   case AsmToken::Plus:               OS << "Plus"; break;
94   case AsmToken::Question:           OS << "Question"; break;
95   case AsmToken::RBrac:              OS << "RBrac"; break;
96   case AsmToken::RCurly:             OS << "RCurly"; break;
97   case AsmToken::RParen:             OS << "RParen"; break;
98   case AsmToken::Slash:              OS << "Slash"; break;
99   case AsmToken::Space:              OS << "Space"; break;
100   case AsmToken::Star:               OS << "Star"; break;
101   case AsmToken::Tilde:              OS << "Tilde"; break;
102     // clang-format on
103   }
104 
105   // Print the token string.
106   OS << " (\"";
107   OS.write_escaped(getString());
108   OS << "\")";
109 }
110 
AsmLexer(const MCAsmInfo & MAI)111 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
112   // For COFF targets, this is true, while for ELF targets, it should be false.
113   // Currently, @specifier parsing depends on '@' being included in the token.
114   AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@") &&
115                         MAI.useAtForSpecifier();
116   LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
117 
118   CurTok.emplace_back(AsmToken::Space, StringRef());
119 }
120 
setBuffer(StringRef Buf,const char * ptr,bool EndStatementAtEOF)121 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
122                          bool EndStatementAtEOF) {
123   CurBuf = Buf;
124 
125   if (ptr)
126     CurPtr = ptr;
127   else
128     CurPtr = CurBuf.begin();
129 
130   TokStart = nullptr;
131   this->EndStatementAtEOF = EndStatementAtEOF;
132 }
133 
134 /// ReturnError - Set the error to the specified string at the specified
135 /// location.  This is defined to always return AsmToken::Error.
ReturnError(const char * Loc,const std::string & Msg)136 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
137   SetError(SMLoc::getFromPointer(Loc), Msg);
138 
139   return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
140 }
141 
getNextChar()142 int AsmLexer::getNextChar() {
143   if (CurPtr == CurBuf.end())
144     return EOF;
145   return (unsigned char)*CurPtr++;
146 }
147 
peekNextChar()148 int AsmLexer::peekNextChar() {
149   if (CurPtr == CurBuf.end())
150     return EOF;
151   return (unsigned char)*CurPtr;
152 }
153 
154 /// The leading integral digit sequence and dot should have already been
155 /// consumed, some or all of the fractional digit sequence *can* have been
156 /// consumed.
LexFloatLiteral()157 AsmToken AsmLexer::LexFloatLiteral() {
158   // Skip the fractional digit sequence.
159   while (isDigit(*CurPtr))
160     ++CurPtr;
161 
162   if (*CurPtr == '-' || *CurPtr == '+')
163     return ReturnError(CurPtr, "invalid sign in float literal");
164 
165   // Check for exponent
166   if ((*CurPtr == 'e' || *CurPtr == 'E')) {
167     ++CurPtr;
168 
169     if (*CurPtr == '-' || *CurPtr == '+')
170       ++CurPtr;
171 
172     while (isDigit(*CurPtr))
173       ++CurPtr;
174   }
175 
176   return AsmToken(AsmToken::Real,
177                   StringRef(TokStart, CurPtr - TokStart));
178 }
179 
180 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
181 /// while making sure there are enough actual digits around for the constant to
182 /// be valid.
183 ///
184 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
185 /// before we get here.
LexHexFloatLiteral(bool NoIntDigits)186 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
187   assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
188          "unexpected parse state in floating hex");
189   bool NoFracDigits = true;
190 
191   // Skip the fractional part if there is one
192   if (*CurPtr == '.') {
193     ++CurPtr;
194 
195     const char *FracStart = CurPtr;
196     while (isHexDigit(*CurPtr))
197       ++CurPtr;
198 
199     NoFracDigits = CurPtr == FracStart;
200   }
201 
202   if (NoIntDigits && NoFracDigits)
203     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
204                                  "expected at least one significand digit");
205 
206   // Make sure we do have some kind of proper exponent part
207   if (*CurPtr != 'p' && *CurPtr != 'P')
208     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
209                                  "expected exponent part 'p'");
210   ++CurPtr;
211 
212   if (*CurPtr == '+' || *CurPtr == '-')
213     ++CurPtr;
214 
215   // N.b. exponent digits are *not* hex
216   const char *ExpStart = CurPtr;
217   while (isDigit(*CurPtr))
218     ++CurPtr;
219 
220   if (CurPtr == ExpStart)
221     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
222                                  "expected at least one exponent digit");
223 
224   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
225 }
226 
227 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
isIdentifierChar(char C,bool AllowAt,bool AllowHash)228 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
229   return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
230          (AllowAt && C == '@') || (AllowHash && C == '#');
231 }
232 
LexIdentifier()233 AsmToken AsmLexer::LexIdentifier() {
234   // Check for floating point literals.
235   if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
236     // Disambiguate a .1243foo identifier from a floating literal.
237     while (isDigit(*CurPtr))
238       ++CurPtr;
239 
240     if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
241                           AllowHashInIdentifier) ||
242         *CurPtr == 'e' || *CurPtr == 'E')
243       return LexFloatLiteral();
244   }
245 
246   while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
247     ++CurPtr;
248 
249   // Handle . as a special case.
250   if (CurPtr == TokStart+1 && TokStart[0] == '.')
251     return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
252 
253   return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
254 }
255 
256 /// LexSlash: Slash: /
257 ///           C-Style Comment: /* ... */
258 ///           C-style Comment: // ...
LexSlash()259 AsmToken AsmLexer::LexSlash() {
260   if (!MAI.shouldAllowAdditionalComments()) {
261     IsAtStartOfStatement = false;
262     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
263   }
264 
265   switch (*CurPtr) {
266   case '*':
267     IsAtStartOfStatement = false;
268     break; // C style comment.
269   case '/':
270     ++CurPtr;
271     return LexLineComment();
272   default:
273     IsAtStartOfStatement = false;
274     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
275   }
276 
277   // C Style comment.
278   ++CurPtr;  // skip the star.
279   const char *CommentTextStart = CurPtr;
280   while (CurPtr != CurBuf.end()) {
281     switch (*CurPtr++) {
282     case '*':
283       // End of the comment?
284       if (*CurPtr != '/')
285         break;
286       // If we have a CommentConsumer, notify it about the comment.
287       if (CommentConsumer) {
288         CommentConsumer->HandleComment(
289             SMLoc::getFromPointer(CommentTextStart),
290             StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
291       }
292       ++CurPtr;   // End the */.
293       return AsmToken(AsmToken::Comment,
294                       StringRef(TokStart, CurPtr - TokStart));
295     }
296   }
297   return ReturnError(TokStart, "unterminated comment");
298 }
299 
300 /// LexLineComment: Comment: #[^\n]*
301 ///                        : //[^\n]*
LexLineComment()302 AsmToken AsmLexer::LexLineComment() {
303   // Mark This as an end of statement with a body of the
304   // comment. While it would be nicer to leave this two tokens,
305   // backwards compatability with TargetParsers makes keeping this in this form
306   // better.
307   const char *CommentTextStart = CurPtr;
308   int CurChar = getNextChar();
309   while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
310     CurChar = getNextChar();
311   const char *NewlinePtr = CurPtr;
312   if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
313     ++CurPtr;
314 
315   // If we have a CommentConsumer, notify it about the comment.
316   if (CommentConsumer) {
317     CommentConsumer->HandleComment(
318         SMLoc::getFromPointer(CommentTextStart),
319         StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
320   }
321 
322   IsAtStartOfLine = true;
323   // This is a whole line comment. leave newline
324   if (IsAtStartOfStatement)
325     return AsmToken(AsmToken::EndOfStatement,
326                     StringRef(TokStart, CurPtr - TokStart));
327   IsAtStartOfStatement = true;
328 
329   return AsmToken(AsmToken::EndOfStatement,
330                   StringRef(TokStart, CurPtr - 1 - TokStart));
331 }
332 
/// Advances \p CurPtr past an optional, case-insensitive integer type suffix:
/// U, L, LL, UL or ULL.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // At most one 'U' ...
  if (*CurPtr == 'U' || *CurPtr == 'u')
    ++CurPtr;

  // ... followed by at most two 'L's.
  for (int Seen = 0; Seen < 2 && (*CurPtr == 'L' || *CurPtr == 'l'); ++Seen)
    ++CurPtr;
}
342 
343 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
344 // integer as a hexadecimal, possibly with leading zeroes.
doHexLookAhead(const char * & CurPtr,unsigned DefaultRadix,bool LexHex)345 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
346                                bool LexHex) {
347   const char *FirstNonDec = nullptr;
348   const char *LookAhead = CurPtr;
349   while (true) {
350     if (isDigit(*LookAhead)) {
351       ++LookAhead;
352     } else {
353       if (!FirstNonDec)
354         FirstNonDec = LookAhead;
355 
356       // Keep going if we are looking for a 'h' suffix.
357       if (LexHex && isHexDigit(*LookAhead))
358         ++LookAhead;
359       else
360         break;
361     }
362   }
363   bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
364   CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
365   if (isHex)
366     return 16;
367   return DefaultRadix;
368 }
369 
findLastDigit(const char * CurPtr,unsigned DefaultRadix)370 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
371   while (hexDigitValue(*CurPtr) < DefaultRadix) {
372     ++CurPtr;
373   }
374   return CurPtr;
375 }
376 
intToken(StringRef Ref,APInt & Value)377 static AsmToken intToken(StringRef Ref, APInt &Value) {
378   if (Value.isIntN(64))
379     return AsmToken(AsmToken::Integer, Ref, Value);
380   return AsmToken(AsmToken::BigNum, Ref, Value);
381 }
382 
/// Returns the name used for \p Radix in diagnostics: "binary", "octal",
/// "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-N" for anything
/// else.
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
397 
398 /// LexDigit: First character is [0-9].
399 ///   Local Label: [0-9][:]
400 ///   Forward/Backward Label: [0-9][fb]
401 ///   Binary integer: 0b[01]+
402 ///   Octal integer: 0[0-7]+
403 ///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
404 ///   Decimal integer: [1-9][0-9]*
LexDigit()405 AsmToken AsmLexer::LexDigit() {
406   // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
407   // MASM-flavor octal integer: [0-7]+[oOqQ]
408   // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
409   // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
410   if (LexMasmIntegers && isdigit(CurPtr[-1])) {
411     const char *FirstNonBinary =
412         (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
413     const char *FirstNonDecimal =
414         (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
415     const char *OldCurPtr = CurPtr;
416     while (isHexDigit(*CurPtr)) {
417       switch (*CurPtr) {
418       default:
419         if (!FirstNonDecimal) {
420           FirstNonDecimal = CurPtr;
421         }
422         [[fallthrough]];
423       case '9':
424       case '8':
425       case '7':
426       case '6':
427       case '5':
428       case '4':
429       case '3':
430       case '2':
431         if (!FirstNonBinary) {
432           FirstNonBinary = CurPtr;
433         }
434         break;
435       case '1':
436       case '0':
437         break;
438       }
439       ++CurPtr;
440     }
441     if (*CurPtr == '.') {
442       // MASM float literals (other than hex floats) always contain a ".", and
443       // are always written in decimal.
444       ++CurPtr;
445       return LexFloatLiteral();
446     }
447 
448     if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
449       ++CurPtr;
450       return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
451     }
452 
453     unsigned Radix = 0;
454     if (*CurPtr == 'h' || *CurPtr == 'H') {
455       // hexadecimal number
456       ++CurPtr;
457       Radix = 16;
458     } else if (*CurPtr == 't' || *CurPtr == 'T') {
459       // decimal number
460       ++CurPtr;
461       Radix = 10;
462     } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
463                *CurPtr == 'Q') {
464       // octal number
465       ++CurPtr;
466       Radix = 8;
467     } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
468       // binary number
469       ++CurPtr;
470       Radix = 2;
471     } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
472                DefaultRadix < 14 &&
473                (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
474       Radix = 10;
475     } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
476                DefaultRadix < 12 &&
477                (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
478       Radix = 2;
479     }
480 
481     if (Radix) {
482       StringRef Result(TokStart, CurPtr - TokStart);
483       APInt Value(128, 0, true);
484 
485       if (Result.drop_back().getAsInteger(Radix, Value))
486         return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
487 
488       // MSVC accepts and ignores type suffices on integer literals.
489       SkipIgnoredIntegerSuffix(CurPtr);
490 
491       return intToken(Result, Value);
492     }
493 
494     // default-radix integers, or floating point numbers, fall through
495     CurPtr = OldCurPtr;
496   }
497 
498   // MASM default-radix integers: [0-9a-fA-F]+
499   // (All other integer literals have a radix specifier.)
500   if (LexMasmIntegers && UseMasmDefaultRadix) {
501     CurPtr = findLastDigit(CurPtr, 16);
502     StringRef Result(TokStart, CurPtr - TokStart);
503 
504     APInt Value(128, 0, true);
505     if (Result.getAsInteger(DefaultRadix, Value)) {
506       return ReturnError(TokStart,
507                          "invalid " + radixName(DefaultRadix) + " number");
508     }
509 
510     return intToken(Result, Value);
511   }
512 
513   // Motorola hex integers: $[0-9a-fA-F]+
514   if (LexMotorolaIntegers && CurPtr[-1] == '$') {
515     const char *NumStart = CurPtr;
516     while (isHexDigit(CurPtr[0]))
517       ++CurPtr;
518 
519     APInt Result(128, 0);
520     if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
521       return ReturnError(TokStart, "invalid hexadecimal number");
522 
523     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
524   }
525 
526   // Motorola binary integers: %[01]+
527   if (LexMotorolaIntegers && CurPtr[-1] == '%') {
528     const char *NumStart = CurPtr;
529     while (*CurPtr == '0' || *CurPtr == '1')
530       ++CurPtr;
531 
532     APInt Result(128, 0);
533     if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
534       return ReturnError(TokStart, "invalid binary number");
535 
536     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
537   }
538 
539   // Decimal integer: [1-9][0-9]*
540   // HLASM-flavour decimal integer: [0-9][0-9]*
541   // FIXME: Later on, support for fb for HLASM has to be added in
542   // as they probably would be needed for asm goto
543   if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
544     unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
545 
546     if (!LexHLASMIntegers) {
547       bool IsHex = Radix == 16;
548       // Check for floating point literals.
549       if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
550         if (*CurPtr == '.')
551           ++CurPtr;
552         return LexFloatLiteral();
553       }
554     }
555 
556     StringRef Result(TokStart, CurPtr - TokStart);
557 
558     APInt Value(128, 0, true);
559     if (Result.getAsInteger(Radix, Value))
560       return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
561 
562     if (!LexHLASMIntegers)
563       // The darwin/x86 (and x86-64) assembler accepts and ignores type
564       // suffices on integer literals.
565       SkipIgnoredIntegerSuffix(CurPtr);
566 
567     return intToken(Result, Value);
568   }
569 
570   if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
571     ++CurPtr;
572     // See if we actually have "0b" as part of something like "jmp 0b\n"
573     if (!isDigit(CurPtr[0])) {
574       --CurPtr;
575       StringRef Result(TokStart, CurPtr - TokStart);
576       return AsmToken(AsmToken::Integer, Result, 0);
577     }
578     const char *NumStart = CurPtr;
579     while (CurPtr[0] == '0' || CurPtr[0] == '1')
580       ++CurPtr;
581 
582     // Requires at least one binary digit.
583     if (CurPtr == NumStart)
584       return ReturnError(TokStart, "invalid binary number");
585 
586     StringRef Result(TokStart, CurPtr - TokStart);
587 
588     APInt Value(128, 0, true);
589     if (Result.substr(2).getAsInteger(2, Value))
590       return ReturnError(TokStart, "invalid binary number");
591 
592     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
593     // suffixes on integer literals.
594     SkipIgnoredIntegerSuffix(CurPtr);
595 
596     return intToken(Result, Value);
597   }
598 
599   if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
600     ++CurPtr;
601     const char *NumStart = CurPtr;
602     while (isHexDigit(CurPtr[0]))
603       ++CurPtr;
604 
605     // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
606     // diagnosed by LexHexFloatLiteral).
607     if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
608       return LexHexFloatLiteral(NumStart == CurPtr);
609 
610     // Otherwise requires at least one hex digit.
611     if (CurPtr == NumStart)
612       return ReturnError(CurPtr-2, "invalid hexadecimal number");
613 
614     APInt Result(128, 0);
615     if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
616       return ReturnError(TokStart, "invalid hexadecimal number");
617 
618     // Consume the optional [hH].
619     if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
620       ++CurPtr;
621 
622     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
623     // suffixes on integer literals.
624     SkipIgnoredIntegerSuffix(CurPtr);
625 
626     return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
627   }
628 
629   // Either octal or hexadecimal.
630   APInt Value(128, 0, true);
631   unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
632   StringRef Result(TokStart, CurPtr - TokStart);
633   if (Result.getAsInteger(Radix, Value))
634     return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
635 
636   // Consume the [hH].
637   if (Radix == 16)
638     ++CurPtr;
639 
640   // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
641   // suffixes on integer literals.
642   SkipIgnoredIntegerSuffix(CurPtr);
643 
644   return intToken(Result, Value);
645 }
646 
647 /// LexSingleQuote: Integer: 'b'
LexSingleQuote()648 AsmToken AsmLexer::LexSingleQuote() {
649   int CurChar = getNextChar();
650 
651   if (LexHLASMStrings)
652     return ReturnError(TokStart, "invalid usage of character literals");
653 
654   if (LexMasmStrings) {
655     while (CurChar != EOF) {
656       if (CurChar != '\'') {
657         CurChar = getNextChar();
658       } else if (peekNextChar() == '\'') {
659         // In MASM single-quote strings, doubled single-quotes mean an escaped
660         // single quote, so should be lexed in.
661         (void)getNextChar();
662         CurChar = getNextChar();
663       } else {
664         break;
665       }
666     }
667     if (CurChar == EOF)
668       return ReturnError(TokStart, "unterminated string constant");
669     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
670   }
671 
672   if (CurChar == '\\')
673     CurChar = getNextChar();
674 
675   if (CurChar == EOF)
676     return ReturnError(TokStart, "unterminated single quote");
677 
678   CurChar = getNextChar();
679 
680   if (CurChar != '\'')
681     return ReturnError(TokStart, "single quote way too long");
682 
683   // The idea here being that 'c' is basically just an integral
684   // constant.
685   StringRef Res = StringRef(TokStart,CurPtr - TokStart);
686   long long Value;
687 
688   if (Res.starts_with("\'\\")) {
689     char theChar = Res[2];
690     switch (theChar) {
691       default: Value = theChar; break;
692       case '\'': Value = '\''; break;
693       case 't': Value = '\t'; break;
694       case 'n': Value = '\n'; break;
695       case 'b': Value = '\b'; break;
696       case 'f': Value = '\f'; break;
697       case 'r': Value = '\r'; break;
698     }
699   } else
700     Value = TokStart[1];
701 
702   return AsmToken(AsmToken::Integer, Res, Value);
703 }
704 
705 /// LexQuote: String: "..."
LexQuote()706 AsmToken AsmLexer::LexQuote() {
707   int CurChar = getNextChar();
708   if (LexHLASMStrings)
709     return ReturnError(TokStart, "invalid usage of string literals");
710 
711   if (LexMasmStrings) {
712     while (CurChar != EOF) {
713       if (CurChar != '"') {
714         CurChar = getNextChar();
715       } else if (peekNextChar() == '"') {
716         // In MASM double-quoted strings, doubled double-quotes mean an escaped
717         // double quote, so should be lexed in.
718         (void)getNextChar();
719         CurChar = getNextChar();
720       } else {
721         break;
722       }
723     }
724     if (CurChar == EOF)
725       return ReturnError(TokStart, "unterminated string constant");
726     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
727   }
728 
729   while (CurChar != '"') {
730     if (CurChar == '\\') {
731       // Allow \", etc.
732       CurChar = getNextChar();
733     }
734 
735     if (CurChar == EOF)
736       return ReturnError(TokStart, "unterminated string constant");
737 
738     CurChar = getNextChar();
739   }
740 
741   return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
742 }
743 
LexUntilEndOfStatement()744 StringRef AsmLexer::LexUntilEndOfStatement() {
745   TokStart = CurPtr;
746 
747   while (!isAtStartOfComment(CurPtr) &&     // Start of line comment.
748          !isAtStatementSeparator(CurPtr) && // End of statement marker.
749          *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
750     ++CurPtr;
751   }
752   return StringRef(TokStart, CurPtr-TokStart);
753 }
754 
LexUntilEndOfLine()755 StringRef AsmLexer::LexUntilEndOfLine() {
756   TokStart = CurPtr;
757 
758   while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
759     ++CurPtr;
760   }
761   return StringRef(TokStart, CurPtr-TokStart);
762 }
763 
peekTokens(MutableArrayRef<AsmToken> Buf,bool ShouldSkipSpace)764 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
765                             bool ShouldSkipSpace) {
766   SaveAndRestore SavedTokenStart(TokStart);
767   SaveAndRestore SavedCurPtr(CurPtr);
768   SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
769   SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
770   SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
771   SaveAndRestore SavedIsPeeking(IsPeeking, true);
772   std::string SavedErr = getErr();
773   SMLoc SavedErrLoc = getErrLoc();
774 
775   size_t ReadCount;
776   for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
777     AsmToken Token = LexToken();
778 
779     Buf[ReadCount] = Token;
780 
781     if (Token.is(AsmToken::Eof)) {
782       ReadCount++;
783       break;
784     }
785   }
786 
787   SetError(SavedErrLoc, SavedErr);
788   return ReadCount;
789 }
790 
isAtStartOfComment(const char * Ptr)791 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
792   if (MAI.isHLASM() && !IsAtStartOfStatement)
793     return false;
794 
795   StringRef CommentString = MAI.getCommentString();
796 
797   if (CommentString.size() == 1)
798     return CommentString[0] == Ptr[0];
799 
800   // Allow # preprocessor comments also be counted as comments for "##" cases
801   if (CommentString[1] == '#')
802     return CommentString[0] == Ptr[0];
803 
804   return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
805 }
806 
isAtStatementSeparator(const char * Ptr)807 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
808   return strncmp(Ptr, MAI.getSeparatorString(),
809                  strlen(MAI.getSeparatorString())) == 0;
810 }
811 
LexToken()812 AsmToken AsmLexer::LexToken() {
813   TokStart = CurPtr;
814   // This always consumes at least one character.
815   int CurChar = getNextChar();
816 
817   if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
818     // If this starts with a '#', this may be a cpp
819     // hash directive and otherwise a line comment.
820     AsmToken TokenBuf[2];
821     MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
822     size_t num = peekTokens(Buf, true);
823     // There cannot be a space preceding this
824     if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
825         TokenBuf[1].is(AsmToken::String)) {
826       CurPtr = TokStart; // reset curPtr;
827       StringRef s = LexUntilEndOfLine();
828       UnLex(TokenBuf[1]);
829       UnLex(TokenBuf[0]);
830       return AsmToken(AsmToken::HashDirective, s);
831     }
832 
833     if (MAI.shouldAllowAdditionalComments())
834       return LexLineComment();
835   }
836 
837   if (isAtStartOfComment(TokStart)) {
838     CurPtr += MAI.getCommentString().size() - 1;
839     return LexLineComment();
840   }
841 
842   if (isAtStatementSeparator(TokStart)) {
843     CurPtr += strlen(MAI.getSeparatorString()) - 1;
844     IsAtStartOfLine = true;
845     IsAtStartOfStatement = true;
846     return AsmToken(AsmToken::EndOfStatement,
847                     StringRef(TokStart, strlen(MAI.getSeparatorString())));
848   }
849 
850   // If we're missing a newline at EOF, make sure we still get an
851   // EndOfStatement token before the Eof token.
852   if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
853     IsAtStartOfLine = true;
854     IsAtStartOfStatement = true;
855     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
856   }
857   IsAtStartOfLine = false;
858   bool OldIsAtStartOfStatement = IsAtStartOfStatement;
859   IsAtStartOfStatement = false;
860   switch (CurChar) {
861   default:
862     // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
863     // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
864     // an identifier is target-dependent. These characters are handled in the
865     // respective switch cases.
866     if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
867       return LexIdentifier();
868 
869     // Unknown character, emit an error.
870     return ReturnError(TokStart, "invalid character in input");
871   case EOF:
872     if (EndStatementAtEOF) {
873       IsAtStartOfLine = true;
874       IsAtStartOfStatement = true;
875     }
876     return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
877   case 0:
878   case ' ':
879   case '\t':
880     IsAtStartOfStatement = OldIsAtStartOfStatement;
881     while (*CurPtr == ' ' || *CurPtr == '\t')
882       CurPtr++;
883     if (SkipSpace)
884       return LexToken(); // Ignore whitespace.
885     else
886       return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
887   case '\r': {
888     IsAtStartOfLine = true;
889     IsAtStartOfStatement = true;
890     // If this is a CR followed by LF, treat that as one token.
891     if (CurPtr != CurBuf.end() && *CurPtr == '\n')
892       ++CurPtr;
893     return AsmToken(AsmToken::EndOfStatement,
894                     StringRef(TokStart, CurPtr - TokStart));
895   }
896   case '\n':
897     IsAtStartOfLine = true;
898     IsAtStartOfStatement = true;
899     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
900   case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
901   case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
902   case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
903   case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
904   case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
905   case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
906   case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
907   case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
908   case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
909   case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
910   case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
911   case '$': {
912     if (LexMotorolaIntegers && isHexDigit(*CurPtr))
913       return LexDigit();
914     if (MAI.doesAllowDollarAtStartOfIdentifier())
915       return LexIdentifier();
916     return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
917   }
918   case '@':
919     if (MAI.doesAllowAtAtStartOfIdentifier())
920       return LexIdentifier();
921     return AsmToken(AsmToken::At, StringRef(TokStart, 1));
922   case '#':
923     if (MAI.isHLASM())
924       return LexIdentifier();
925     return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
926   case '?':
927     if (MAI.doesAllowQuestionAtStartOfIdentifier())
928       return LexIdentifier();
929     return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
930   case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
931   case '=':
932     if (*CurPtr == '=') {
933       ++CurPtr;
934       return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
935     }
936     return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
937   case '-':
938     if (*CurPtr == '>') {
939       ++CurPtr;
940       return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
941     }
942     return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
943   case '|':
944     if (*CurPtr == '|') {
945       ++CurPtr;
946       return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
947     }
948     return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
949   case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
950   case '&':
951     if (*CurPtr == '&') {
952       ++CurPtr;
953       return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
954     }
955     return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
956   case '!':
957     if (*CurPtr == '=') {
958       ++CurPtr;
959       return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
960     }
961     return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
962   case '%':
963     if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
964       return LexDigit();
965     }
966     return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
967   case '/':
968     IsAtStartOfStatement = OldIsAtStartOfStatement;
969     return LexSlash();
970   case '\'': return LexSingleQuote();
971   case '"': return LexQuote();
972   case '0': case '1': case '2': case '3': case '4':
973   case '5': case '6': case '7': case '8': case '9':
974     return LexDigit();
975   case '<':
976     switch (*CurPtr) {
977     case '<':
978       ++CurPtr;
979       return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
980     case '=':
981       ++CurPtr;
982       return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
983     case '>':
984       ++CurPtr;
985       return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
986     default:
987       return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
988     }
989   case '>':
990     switch (*CurPtr) {
991     case '>':
992       ++CurPtr;
993       return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
994     case '=':
995       ++CurPtr;
996       return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
997     default:
998       return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
999     }
1000 
1001   // TODO: Quoted identifiers (objc methods etc)
1002   // local labels: [0-9][:]
1003   // Forward/backward labels: [0-9][fb]
1004   // Integers, fp constants, character constants.
1005   }
1006 }
1007