1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implement the Lexer for TableGen.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "TGLexer.h"
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/StringExtras.h"
16 #include "llvm/ADT/StringSwitch.h"
17 #include "llvm/ADT/Twine.h"
18 #include "llvm/Config/config.h" // for strtoull()/strtoll() define
19 #include "llvm/Support/Compiler.h"
20 #include "llvm/Support/MemoryBuffer.h"
21 #include "llvm/Support/SourceMgr.h"
22 #include "llvm/TableGen/Error.h"
23 #include <cerrno>
24 #include <cstdio>
25 #include <cstdlib>
26 #include <cstring>
27
28 using namespace llvm;
29
namespace {
// A list of supported preprocessing directives with their
// internal token kinds and names.
struct PreprocessorDir {
  tgtok::TokKind Kind; // Token kind of the directive, e.g. tgtok::Ifdef.
  StringRef Word;      // Directive spelling, without the leading '#'.
};
} // end anonymous namespace
38
39 /// Returns true if `C` is a valid character in an identifier. If `First` is
40 /// true, returns true if `C` is a valid first character of an identifier,
41 /// else returns true if `C` is a valid non-first character of an identifier.
42 /// Identifiers match the following regular expression:
43 /// [a-zA-Z_][0-9a-zA-Z_]*
isValidIDChar(char C,bool First)44 static bool isValidIDChar(char C, bool First) {
45 if (C == '_' || isAlpha(C))
46 return true;
47 return !First && isDigit(C);
48 }
49
// Table of all preprocessing directives recognized by the lexer, pairing
// each directive's token kind with its spelling (without the leading '#').
constexpr PreprocessorDir PreprocessorDirs[] = {{tgtok::Ifdef, "ifdef"},
                                                {tgtok::Ifndef, "ifndef"},
                                                {tgtok::Else, "else"},
                                                {tgtok::Endif, "endif"},
                                                {tgtok::Define, "define"}};
55
56 // Returns a pointer past the end of a valid macro name at the start of `Str`.
57 // Valid macro names match the regular expression [a-zA-Z_][0-9a-zA-Z_]*.
lexMacroName(StringRef Str)58 static const char *lexMacroName(StringRef Str) {
59 assert(!Str.empty());
60
61 // Macro names start with [a-zA-Z_].
62 const char *Next = Str.begin();
63 if (!isValidIDChar(*Next, /*First=*/true))
64 return Next;
65 // Eat the first character of the name.
66 ++Next;
67
68 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
69 const char *End = Str.end();
70 while (Next != End && isValidIDChar(*Next, /*First=*/false))
71 ++Next;
72 return Next;
73 }
74
TGLexer(SourceMgr & SM,ArrayRef<std::string> Macros)75 TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
76 CurBuffer = SrcMgr.getMainFileID();
77 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
78 CurPtr = CurBuf.begin();
79 TokStart = nullptr;
80
81 // Pretend that we enter the "top-level" include file.
82 PrepIncludeStack.emplace_back();
83
84 // Add all macros defined on the command line to the DefinedMacros set.
85 // Check invalid macro names and print fatal error if we find one.
86 for (StringRef MacroName : Macros) {
87 const char *End = lexMacroName(MacroName);
88 if (End != MacroName.end())
89 PrintFatalError("invalid macro name `" + MacroName +
90 "` specified on command line");
91
92 DefinedMacros.insert(MacroName);
93 }
94 }
95
/// Return the source location of the start of the current token.
SMLoc TGLexer::getLoc() const {
  return SMLoc::getFromPointer(TokStart);
}
99
/// Return the range from the start of the current token (TokStart) up to the
/// current lexing position (CurPtr).
SMRange TGLexer::getLocRange() const {
  return {getLoc(), SMLoc::getFromPointer(CurPtr)};
}
103
/// ReturnError - Set the error to the specified string at the specified
/// location. This is defined to always return tgtok::Error.
tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
  PrintError(Loc, Msg);
  return tgtok::Error;
}
110
/// Convenience overload: report `Msg` at a raw buffer pointer `Loc` and
/// return tgtok::Error.
tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  return ReturnError(SMLoc::getFromPointer(Loc), Msg);
}
114
/// Handle hitting EOF in the current buffer. If the buffer was an included
/// file, switch back to the parent buffer and return true so lexing resumes
/// right after the 'include "..."' construct. Returns false when the
/// top-level file ends (or the preprocessor control stack is unbalanced),
/// which stops the lexer.
bool TGLexer::processEOF() {
  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  if (ParentIncludeLoc != SMLoc()) {
    // If prepExitInclude() detects a problem with the preprocessing
    // control stack, it will return false. Pretend that we reached
    // the final EOF and stop lexing more tokens by returning false
    // to LexToken().
    if (!prepExitInclude(false))
      return false;

    // Resume lexing in the parent (including) buffer.
    CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
    CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
    CurPtr = ParentIncludeLoc.getPointer();
    // Make sure TokStart points into the parent file's buffer.
    // LexToken() assigns to it before calling getNextChar(),
    // so it is pointing into the included file now.
    TokStart = CurPtr;
    return true;
  }

  // Pretend that we exit the "top-level" include file.
  // Note that in case of an error (e.g. control stack imbalance)
  // the routine will issue a fatal error.
  prepExitInclude(true);
  return false;
}
141
/// Read one character from the buffer and advance CurPtr. Returns EOF at the
/// true end of the buffer, maps an embedded NUL to ' ' (with an error), and
/// canonicalizes every newline sequence (\n, \r, \r\n, \n\r) to a single
/// '\n'.
int TGLexer::getNextChar() {
  char CurChar = *CurPtr++;
  switch (CurChar) {
  default:
    // Widen as unsigned so chars >= 0x80 don't collide with EOF (-1).
    return (unsigned char)CurChar;

  case 0: {
    // A NUL character in the stream is either the end of the current buffer or
    // a spurious NUL in the file. Disambiguate that here.
    if (CurPtr - 1 == CurBuf.end()) {
      --CurPtr; // Arrange for another call to return EOF again.
      return EOF;
    }
    PrintError(getLoc(),
               "NUL character is invalid in source; treated as space");
    return ' ';
  }

  case '\n':
  case '\r':
    // Handle the newline character by ignoring it and incrementing the line
    // count. However, be careful about 'dos style' files with \n\r in them.
    // Only treat a \n\r or \r\n as a single line.
    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
        *CurPtr != CurChar)
      ++CurPtr; // Eat the two char newline sequence.
    return '\n';
  }
}
171
/// Return the character `Index` positions ahead of CurPtr without consuming
/// anything. NOTE(review): this does no bounds check — it appears to rely on
/// the MemoryBuffer's NUL terminator stopping the callers' digit/word scans;
/// confirm callers never peek past that terminator.
int TGLexer::peekNextChar(int Index) const {
  return *(CurPtr + Index);
}
175
/// LexToken - Read and classify the next token from the buffer.
/// `FileOrLineStart` is true at the start of a file or line — the only
/// position where '#' can begin a preprocessing directive rather than being
/// the paste operator.
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  switch (CurChar) {
  default:
    // Handle letters: [a-zA-Z_]
    if (isValidIDChar(CurChar, /*First=*/true))
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "unexpected character");
  case EOF:
    // Lex next token, if we just left an include file.
    // Note that leaving an include file means that the next
    // symbol is located at the end of the 'include "..."'
    // construct, so LexToken() is called with default
    // false parameter.
    if (processEOF())
      return LexToken();

    // Return EOF denoting the end of lexing.
    return tgtok::Eof;

  case ':': return tgtok::colon;
  case ';': return tgtok::semi;
  case ',': return tgtok::comma;
  case '<': return tgtok::less;
  case '>': return tgtok::greater;
  case ']': return tgtok::r_square;
  case '{': return tgtok::l_brace;
  case '}': return tgtok::r_brace;
  case '(': return tgtok::l_paren;
  case ')': return tgtok::r_paren;
  case '=': return tgtok::equal;
  case '?': return tgtok::question;
  case '#':
    // A '#' at file/line start may begin a preprocessing directive;
    // otherwise it is the paste operator.
    if (FileOrLineStart) {
      tgtok::TokKind Kind = prepIsDirective();
      if (Kind != tgtok::Error)
        return lexPreprocessor(Kind);
    }

    return tgtok::paste;

  // The period is a separate case so we can recognize the "..."
  // range punctuator.
  case '.':
    if (peekNextChar(0) == '.') {
      ++CurPtr; // Eat second dot.
      if (peekNextChar(0) == '.') {
        ++CurPtr; // Eat third dot.
        return tgtok::dotdotdot;
      }
      return ReturnError(TokStart, "invalid '..' punctuation");
    }
    return tgtok::dot;

  case '\r':
    // getNextChar() canonicalizes all newline forms to '\n'.
    llvm_unreachable("getNextChar() must never return '\r'");

  case ' ':
  case '\t':
    // Ignore whitespace.
    return LexToken(FileOrLineStart);
  case '\n':
    // Ignore whitespace, and identify the new line.
    return LexToken(true);
  case '/':
    // If this is the start of a // comment, skip until the end of the line or
    // the end of the buffer.
    if (*CurPtr == '/')
      SkipBCPLComment();
    else if (*CurPtr == '*') {
      if (SkipCComment())
        return tgtok::Error;
    } else // Otherwise, this is an error.
      return ReturnError(TokStart, "unexpected character");
    return LexToken(FileOrLineStart);
  case '-': case '+':
  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
  case '7': case '8': case '9': {
    int NextChar = 0;
    if (isDigit(CurChar)) {
      // Allow identifiers to start with a number if it is followed by
      // an identifier. This can happen with paste operations like
      // foo#8i.
      int i = 0;
      do {
        NextChar = peekNextChar(i++);
      } while (isDigit(NextChar));

      if (NextChar == 'x' || NextChar == 'b') {
        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
        // likely a number.
        int NextNextChar = peekNextChar(i);
        switch (NextNextChar) {
        default:
          break;
        case '0': case '1':
          if (NextChar == 'b')
            return LexNumber();
          [[fallthrough]];
        case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          if (NextChar == 'x')
            return LexNumber();
          break;
        }
      }
    }

    if (isValidIDChar(NextChar, /*First=*/true))
      return LexIdentifier();

    return LexNumber();
  }
  case '"': return LexString();
  case '$': return LexVarName();
  case '[': return LexBracket();
  case '!': return LexExclaim();
  }
}
302
303 /// LexString - Lex "[^"]*"
LexString()304 tgtok::TokKind TGLexer::LexString() {
305 const char *StrStart = CurPtr;
306
307 CurStrVal = "";
308
309 while (*CurPtr != '"') {
310 // If we hit the end of the buffer, report an error.
311 if (*CurPtr == 0 && CurPtr == CurBuf.end())
312 return ReturnError(StrStart, "end of file in string literal");
313
314 if (*CurPtr == '\n' || *CurPtr == '\r')
315 return ReturnError(StrStart, "end of line in string literal");
316
317 if (*CurPtr != '\\') {
318 CurStrVal += *CurPtr++;
319 continue;
320 }
321
322 ++CurPtr;
323
324 switch (*CurPtr) {
325 case '\\': case '\'': case '"':
326 // These turn into their literal character.
327 CurStrVal += *CurPtr++;
328 break;
329 case 't':
330 CurStrVal += '\t';
331 ++CurPtr;
332 break;
333 case 'n':
334 CurStrVal += '\n';
335 ++CurPtr;
336 break;
337
338 case '\n':
339 case '\r':
340 return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
341
342 // If we hit the end of the buffer, report an error.
343 case '\0':
344 if (CurPtr == CurBuf.end())
345 return ReturnError(StrStart, "end of file in string literal");
346 [[fallthrough]];
347 default:
348 return ReturnError(CurPtr, "invalid escape in string literal");
349 }
350 }
351
352 ++CurPtr;
353 return tgtok::StrVal;
354 }
355
LexVarName()356 tgtok::TokKind TGLexer::LexVarName() {
357 if (!isValidIDChar(CurPtr[0], /*First=*/true))
358 return ReturnError(TokStart, "invalid variable name");
359
360 // Otherwise, we're ok, consume the rest of the characters.
361 const char *VarNameStart = CurPtr++;
362
363 while (isValidIDChar(*CurPtr, /*First=*/false))
364 ++CurPtr;
365
366 CurStrVal.assign(VarNameStart, CurPtr);
367 return tgtok::VarName;
368 }
369
/// Lex an identifier matching [a-zA-Z_][0-9a-zA-Z_]* whose first character
/// was already consumed (TokStart points at it). Reserved keywords get their
/// own token kinds; "include" triggers include-file processing; plain
/// identifiers store their text in CurStrVal and return tgtok::Id.
tgtok::TokKind TGLexer::LexIdentifier() {
  // The first letter is [a-zA-Z_].
  const char *IdentStart = TokStart;

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isValidIDChar(*CurPtr, /*First=*/false))
    ++CurPtr;

  // Check to see if this identifier is a reserved keyword.
  StringRef Str(IdentStart, CurPtr-IdentStart);

  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
                            .Case("int", tgtok::Int)
                            .Case("bit", tgtok::Bit)
                            .Case("bits", tgtok::Bits)
                            .Case("string", tgtok::String)
                            .Case("list", tgtok::List)
                            .Case("code", tgtok::Code)
                            .Case("dag", tgtok::Dag)
                            .Case("class", tgtok::Class)
                            .Case("def", tgtok::Def)
                            .Case("true", tgtok::TrueVal)
                            .Case("false", tgtok::FalseVal)
                            .Case("foreach", tgtok::Foreach)
                            .Case("defm", tgtok::Defm)
                            .Case("defset", tgtok::Defset)
                            .Case("deftype", tgtok::Deftype)
                            .Case("multiclass", tgtok::MultiClass)
                            .Case("field", tgtok::Field)
                            .Case("let", tgtok::Let)
                            .Case("in", tgtok::In)
                            .Case("defvar", tgtok::Defvar)
                            .Case("include", tgtok::Include)
                            .Case("if", tgtok::If)
                            .Case("then", tgtok::Then)
                            .Case("else", tgtok::ElseKW)
                            .Case("assert", tgtok::Assert)
                            .Case("dump", tgtok::Dump)
                            .Default(tgtok::Id);

  // A couple of tokens require special processing.
  switch (Kind) {
  case tgtok::Include:
    // Enter the included file and return its first token.
    if (LexInclude()) return tgtok::Error;
    return Lex();
  case tgtok::Id:
    CurStrVal.assign(Str.begin(), Str.end());
    break;
  default:
    break;
  }

  return Kind;
}
424
/// LexInclude - We just read the "include" token. Get the string token that
/// comes next and enter the include. Returns true on error.
bool TGLexer::LexInclude() {
  // The token after the include must be a string.
  tgtok::TokKind Tok = LexToken();
  if (Tok == tgtok::Error) return true;
  if (Tok != tgtok::StrVal) {
    PrintError(getLoc(), "expected filename after include");
    return true;
  }

  // Get the string.
  std::string Filename = CurStrVal;
  std::string IncludedFile;

  // Ask the SourceMgr to locate and load the file; 0 means failure.
  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
                                    IncludedFile);
  if (!CurBuffer) {
    PrintError(getLoc(), "could not find include file '" + Filename + "'");
    return true;
  }

  Dependencies.insert(IncludedFile);
  // Switch lexing to the start of the included buffer.
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();

  // Open a fresh preprocessor control scope for the included file.
  PrepIncludeStack.emplace_back();
  return false;
}
455
456 /// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
457 /// Or we may end up at the end of the buffer.
SkipBCPLComment()458 void TGLexer::SkipBCPLComment() {
459 ++CurPtr; // skip the second slash.
460 auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
461 CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
462 }
463
464 /// SkipCComment - This skips C-style /**/ comments. The only difference from C
465 /// is that we allow nesting.
SkipCComment()466 bool TGLexer::SkipCComment() {
467 ++CurPtr; // skip the star.
468 unsigned CommentDepth = 1;
469
470 while (true) {
471 int CurChar = getNextChar();
472 switch (CurChar) {
473 case EOF:
474 PrintError(TokStart, "unterminated comment");
475 return true;
476 case '*':
477 // End of the comment?
478 if (CurPtr[0] != '/') break;
479
480 ++CurPtr; // End the */.
481 if (--CommentDepth == 0)
482 return false;
483 break;
484 case '/':
485 // Start of a nested comment?
486 if (CurPtr[0] != '*') break;
487 ++CurPtr;
488 ++CommentDepth;
489 break;
490 }
491 }
492 }
493
/// LexNumber - Lex:
///    [-+]?[0-9]+
///    0x[0-9a-fA-F]+
///    0b[01]+
tgtok::TokKind TGLexer::LexNumber() {
  unsigned Base = 0;
  const char *NumStart;

  // Check if it's a hex or a binary value.
  if (CurPtr[-1] == '0') {
    // Digits start after the 'x'/'b' prefix character.
    NumStart = CurPtr + 1;
    if (CurPtr[0] == 'x') {
      Base = 16;
      do
        ++CurPtr;
      while (isHexDigit(CurPtr[0]));
    } else if (CurPtr[0] == 'b') {
      Base = 2;
      do
        ++CurPtr;
      while (CurPtr[0] == '0' || CurPtr[0] == '1');
    }
  }

  // For a hex or binary value, we always convert it to an unsigned value.
  bool IsMinus = false;

  // Check if it's a decimal value.
  if (Base == 0) {
    // Check for a sign without a digit: '-' and '+' alone are operators.
    if (!isDigit(CurPtr[0])) {
      if (CurPtr[-1] == '-')
        return tgtok::minus;
      else if (CurPtr[-1] == '+')
        return tgtok::plus;
    }

    Base = 10;
    NumStart = TokStart; // Include the leading sign, if any.
    IsMinus = CurPtr[-1] == '-';

    while (isDigit(CurPtr[0]))
      ++CurPtr;
  }

  // Requires at least one digit (catches e.g. "0x" with nothing after it).
  if (CurPtr == NumStart)
    return ReturnError(TokStart, "invalid number");

  // Clear errno so EINVAL/ERANGE below reliably reflect this conversion.
  errno = 0;
  if (IsMinus)
    CurIntVal = strtoll(NumStart, nullptr, Base);
  else
    CurIntVal = strtoull(NumStart, nullptr, Base);

  if (errno == EINVAL)
    return ReturnError(TokStart, "invalid number");
  if (errno == ERANGE)
    return ReturnError(TokStart, "number out of range");

  return Base == 2 ? tgtok::BinaryIntVal : tgtok::IntVal;
}
556
/// LexBracket - We just read '['. If this is a code block, return it,
/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
tgtok::TokKind TGLexer::LexBracket() {
  if (CurPtr[0] != '{')
    return tgtok::l_square;
  ++CurPtr;
  const char *CodeStart = CurPtr;
  // Scan for the closing "}]"; any other content (including newlines and
  // lone '}') is part of the code fragment.
  while (true) {
    int Char = getNextChar();
    if (Char == EOF) break;

    if (Char != '}') continue;

    Char = getNextChar();
    if (Char == EOF) break;
    if (Char == ']') {
      // Store the fragment text, excluding the trailing "}]".
      CurStrVal.assign(CodeStart, CurPtr-2);
      return tgtok::CodeFragment;
    }
  }

  // EOF before "}]": point the error at the opening "[{".
  return ReturnError(CodeStart - 2, "unterminated code block");
}
580
/// LexExclaim - Lex '!' and '![a-zA-Z]+'. Maps the bang-operator name to its
/// token kind, or reports "unknown operator" for unrecognized names.
tgtok::TokKind TGLexer::LexExclaim() {
  if (!isAlpha(*CurPtr))
    return ReturnError(CurPtr - 1, "invalid \"!operator\"");

  // Consume the alphabetic operator name following the '!'.
  const char *Start = CurPtr++;
  while (isAlpha(*CurPtr))
    ++CurPtr;

  // Check to see which operator this is.
  tgtok::TokKind Kind =
      StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
          .Case("eq", tgtok::XEq)
          .Case("ne", tgtok::XNe)
          .Case("le", tgtok::XLe)
          .Case("lt", tgtok::XLt)
          .Case("ge", tgtok::XGe)
          .Case("gt", tgtok::XGt)
          .Case("if", tgtok::XIf)
          .Case("cond", tgtok::XCond)
          .Case("isa", tgtok::XIsA)
          .Case("head", tgtok::XHead)
          .Case("tail", tgtok::XTail)
          .Case("size", tgtok::XSize)
          .Case("con", tgtok::XConcat)
          .Case("dag", tgtok::XDag)
          .Case("add", tgtok::XADD)
          .Case("sub", tgtok::XSUB)
          .Case("mul", tgtok::XMUL)
          .Case("div", tgtok::XDIV)
          .Case("not", tgtok::XNOT)
          .Case("logtwo", tgtok::XLOG2)
          .Case("and", tgtok::XAND)
          .Case("or", tgtok::XOR)
          .Case("xor", tgtok::XXOR)
          .Case("shl", tgtok::XSHL)
          .Case("sra", tgtok::XSRA)
          .Case("srl", tgtok::XSRL)
          .Case("cast", tgtok::XCast)
          .Case("empty", tgtok::XEmpty)
          .Case("subst", tgtok::XSubst)
          .Case("foldl", tgtok::XFoldl)
          .Case("foreach", tgtok::XForEach)
          .Case("filter", tgtok::XFilter)
          .Case("listconcat", tgtok::XListConcat)
          .Case("listflatten", tgtok::XListFlatten)
          .Case("listsplat", tgtok::XListSplat)
          .Case("listremove", tgtok::XListRemove)
          .Case("range", tgtok::XRange)
          .Case("strconcat", tgtok::XStrConcat)
          .Case("initialized", tgtok::XInitialized)
          .Case("interleave", tgtok::XInterleave)
          .Case("instances", tgtok::XInstances)
          .Case("substr", tgtok::XSubstr)
          .Case("find", tgtok::XFind)
          .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
          .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
          .Case("getdagarg", tgtok::XGetDagArg)
          .Case("getdagname", tgtok::XGetDagName)
          .Case("setdagarg", tgtok::XSetDagArg)
          .Case("setdagname", tgtok::XSetDagName)
          .Case("exists", tgtok::XExists)
          .Case("tolower", tgtok::XToLower)
          .Case("toupper", tgtok::XToUpper)
          .Case("repr", tgtok::XRepr)
          .Case("match", tgtok::XMatch)
          .Default(tgtok::Error);

  return Kind != tgtok::Error ? Kind
                              : ReturnError(Start - 1, "unknown operator");
}
652
/// Pop one level of the preprocessor include stack when leaving a file.
/// Returns false (after reporting an error) if the file being left still has
/// unmatched #ifdef/#else regions. `IncludeStackMustBeEmpty` is true only
/// when leaving the top-level file.
bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
  // Report an error, if preprocessor control stack for the current
  // file is not empty.
  if (!PrepIncludeStack.back().empty()) {
    prepReportPreprocessorStackError();

    return false;
  }

  // Pop the preprocessing controls from the include stack.
  PrepIncludeStack.pop_back();

  if (IncludeStackMustBeEmpty) {
    assert(PrepIncludeStack.empty() &&
           "preprocessor include stack is not empty");
  } else {
    assert(!PrepIncludeStack.empty() && "preprocessor include stack is empty");
  }

  return true;
}
674
/// Check whether CurPtr (just past a '#') starts a preprocessing directive.
/// Returns the directive's token kind, or tgtok::Error if the text does not
/// form a recognized directive followed by an acceptable delimiter.
tgtok::TokKind TGLexer::prepIsDirective() const {
  for (const auto [Kind, Word] : PreprocessorDirs) {
    if (StringRef(CurPtr, Word.size()) != Word)
      continue;
    int NextChar = peekNextChar(Word.size());

    // Check for whitespace after the directive. If there is no whitespace,
    // then we do not recognize it as a preprocessing directive.

    // New line and EOF may follow only #else/#endif. It will be reported
    // as an error for #ifdef/#define after the call to prepLexMacroName().
    if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
        NextChar == '\n' ||
        // It looks like TableGen does not support '\r' as the actual
        // carriage return, e.g. getNextChar() treats a single '\r'
        // as '\n'. So we do the same here.
        NextChar == '\r')
      return Kind;

    // Allow comments after some directives, e.g.:
    //     #else// OR #else/**/
    //     #endif// OR #endif/**/
    //
    // Note that we do allow comments after #ifdef/#define here, e.g.
    //     #ifdef/**/ AND #ifdef//
    //     #define/**/ AND #define//
    //
    // These cases will be reported as incorrect after calling
    // prepLexMacroName(). We could have supported C-style comments
    // after #ifdef/#define, but this would complicate the code
    // for little benefit.
    if (NextChar == '/') {
      NextChar = peekNextChar(Word.size() + 1);

      if (NextChar == '*' || NextChar == '/')
        return Kind;

      // Pretend that we do not recognize the directive.
    }
  }

  return tgtok::Error;
}
718
prepEatPreprocessorDirective(tgtok::TokKind Kind)719 void TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
720 TokStart = CurPtr;
721
722 for (const auto [PKind, PWord] : PreprocessorDirs) {
723 if (PKind == Kind) {
724 // Advance CurPtr to the end of the preprocessing word.
725 CurPtr += PWord.size();
726 return;
727 }
728 }
729
730 llvm_unreachable(
731 "unsupported preprocessing token in prepEatPreprocessorDirective()");
732 }
733
/// Lex one preprocessing directive whose kind was identified by
/// prepIsDirective(). `ReturnNextLiveToken` is true when called while
/// processing live tokens; false when called from the lines-skipping code
/// (prepSkipRegion). Returns the next live token, the directive's own kind
/// (in skipping mode), or tgtok::Error.
tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
                                        bool ReturnNextLiveToken) {
  // We must be looking at a preprocessing directive. Eat it!
  prepEatPreprocessorDirective(Kind);

  if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
    StringRef MacroName = prepLexMacroName();
    StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
    if (MacroName.empty())
      return ReturnError(TokStart, "expected macro name after " + IfTokName);

    bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;

    // Canonicalize ifndef's MacroIsDefined to its ifdef equivalent.
    if (Kind == tgtok::Ifndef)
      MacroIsDefined = !MacroIsDefined;

    // Regardless of whether we are processing tokens or not,
    // we put the #ifdef control on stack.
    // Note that MacroIsDefined has been canonicalized against ifdef.
    PrepIncludeStack.back().push_back(
        {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "only comments are supported after " +
                                     IfTokName + " NAME");

    // If we were not processing tokens before this #ifdef,
    // then just return back to the lines skipping code.
    if (!ReturnNextLiveToken)
      return Kind;

    // If we were processing tokens before this #ifdef,
    // and the macro is defined, then just return the next token.
    if (MacroIsDefined)
      return LexToken();

    // We were processing tokens before this #ifdef, and the macro
    // is not defined, so we have to start skipping the lines.
    // If the skipping is successful, it will return the token following
    // either #else or #endif corresponding to this #ifdef.
    if (prepSkipRegion(ReturnNextLiveToken))
      return LexToken();

    return tgtok::Error;
  } else if (Kind == tgtok::Else) {
    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #else.
    if (PrepIncludeStack.back().empty())
      return ReturnError(TokStart, "#else without #ifdef or #ifndef");

    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back();

    // The stack top must be an #ifdef; a tgtok::Else entry means a second
    // #else for the same region.
    if (IfdefEntry.Kind != tgtok::Ifdef) {
      PrintError(TokStart, "double #else");
      return ReturnError(IfdefEntry.SrcPos, "previous #else is here");
    }

    // Replace the corresponding #ifdef's control with its negation
    // on the control stack.
    PrepIncludeStack.back().back() = {Kind, !IfdefEntry.IsDefined,
                                      SMLoc::getFromPointer(TokStart)};

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "only comments are supported after #else");

    // If we were processing tokens before this #else,
    // we have to start skipping lines until the matching #endif.
    if (ReturnNextLiveToken) {
      if (prepSkipRegion(ReturnNextLiveToken))
        return LexToken();

      return tgtok::Error;
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Endif) {
    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #endif.
    if (PrepIncludeStack.back().empty())
      return ReturnError(TokStart, "#endif without #ifdef");

    [[maybe_unused]] auto &IfdefOrElseEntry = PrepIncludeStack.back().back();

    assert((IfdefOrElseEntry.Kind == tgtok::Ifdef ||
            IfdefOrElseEntry.Kind == tgtok::Else) &&
           "invalid preprocessor control on the stack");

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "only comments are supported after #endif");

    // Close the region by popping its control entry.
    PrepIncludeStack.back().pop_back();

    // If we were processing tokens before this #endif, then
    // we should continue it.
    if (ReturnNextLiveToken) {
      return LexToken();
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Define) {
    StringRef MacroName = prepLexMacroName();
    if (MacroName.empty())
      return ReturnError(TokStart, "expected macro name after #define");

    // Redefinition is allowed but diagnosed as a warning.
    if (!DefinedMacros.insert(MacroName).second)
      PrintWarning(getLoc(),
                   "duplicate definition of macro: " + Twine(MacroName));

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr,
                         "only comments are supported after #define NAME");

    assert(ReturnNextLiveToken &&
           "#define must be ignored during the lines skipping");

    return LexToken();
  }

  llvm_unreachable("preprocessing directive is not supported");
}
857
/// Skip lines belonging to a disabled preprocessor region until the matching
/// #else or #endif re-enables token processing. Returns true when processing
/// may resume (CurPtr is past the enabling directive), false on error.
/// `MustNeverBeFalse` documents that this is only entered from live-token
/// mode; it is asserted, not used.
bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
  assert(MustNeverBeFalse && "invalid recursion.");

  do {
    // Skip all symbols to the line end.
    // NOTE(review): this scan has no explicit end-of-buffer check; it
    // appears to rely on the buffer's NUL terminator / trailing newline —
    // confirm a final line without '\n' cannot reach here.
    while (*CurPtr != '\n')
      ++CurPtr;

    // Find the first non-whitespace symbol in the next line(s).
    if (!prepSkipLineBegin())
      return false;

    // If the first non-blank/comment symbol on the line is '#',
    // it may be a start of preprocessing directive.
    //
    // If it is not '#' just go to the next line.
    if (*CurPtr == '#')
      ++CurPtr;
    else
      continue;

    tgtok::TokKind Kind = prepIsDirective();

    // If we did not find a preprocessing directive or it is #define,
    // then just skip to the next line. We do not have to do anything
    // for #define in the line-skipping mode.
    if (Kind == tgtok::Error || Kind == tgtok::Define)
      continue;

    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);

    // If lexPreprocessor() encountered an error during lexing this
    // preprocessor idiom, then return false to the calling lexPreprocessor().
    // This will force tgtok::Error to be returned to the tokens processing.
    if (ProcessedKind == tgtok::Error)
      return false;

    assert(Kind == ProcessedKind && "prepIsDirective() and lexPreprocessor() "
                                    "returned different token kinds");

    // If this preprocessing directive enables tokens processing,
    // then return to the lexPreprocessor() and get to the next token.
    // We can move from line-skipping mode to processing tokens only
    // due to #else or #endif.
    if (prepIsProcessingEnabled()) {
      assert((Kind == tgtok::Else || Kind == tgtok::Endif) &&
             "tokens processing was enabled by an unexpected preprocessing "
             "directive");

      return true;
    }
  } while (CurPtr != CurBuf.end());

  // We have reached the end of the file, but never left the lines-skipping
  // mode. This means there is no matching #endif.
  prepReportPreprocessorStackError();
  return false;
}
916
prepLexMacroName()917 StringRef TGLexer::prepLexMacroName() {
918 // Skip whitespaces between the preprocessing directive and the macro name.
919 while (*CurPtr == ' ' || *CurPtr == '\t')
920 ++CurPtr;
921
922 TokStart = CurPtr;
923 CurPtr = lexMacroName(StringRef(CurPtr, CurBuf.end() - CurPtr));
924 return StringRef(TokStart, CurPtr - TokStart);
925 }
926
/// In lines-skipping mode, advance CurPtr over whitespace, newlines, and
/// C-style comments until the first meaningful character (or the end of the
/// buffer). Returns false only if an unterminated C-style comment was found.
bool TGLexer::prepSkipLineBegin() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      // Plain whitespace: fall through to the increment below.
      break;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '*') {
        // Skip C-style comment.
        // Note that we do not care about skipping the C++-style comments.
        // If the line contains "//", it may not contain any processable
        // preprocessing directive. Just return CurPtr pointing to
        // the first '/' in this case. We also do not care about
        // incorrect symbols after the first '/' - we are in lines-skipping
        // mode, so incorrect code is allowed to some extent.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;

        // CurPtr must point to '*' before call to SkipCComment().
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // CurPtr points to the non-whitespace '/'.
        return true;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // First meaningful character of the line.
      return true;
    }

    ++CurPtr;
  }

  // We have reached the end of the file. Return to the lines skipping
  // code, and allow it to handle the EOF as needed.
  return true;
}
975
/// Skip to the end of a preprocessing directive's line, allowing only
/// whitespace and comments after the directive (and its operand, if any).
/// Returns false if any other character is found before the line/buffer end.
bool TGLexer::prepSkipDirectiveEnd() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
      // Horizontal whitespace is always allowed; keep scanning.
      break;

    case '\n':
    case '\r':
      // Reached the end of the directive's line cleanly.
      return true;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '/') {
        // Skip C++-style comment.
        // We may just return true now, but let's skip to the line/buffer end
        // to simplify the method specification.
        ++CurPtr;
        SkipBCPLComment();
      } else if (NextChar == '*') {
        // When we are skipping C-style comment at the end of a preprocessing
        // directive, we can skip several lines. If any meaningful TD token
        // follows the end of the C-style comment on the same line, it will
        // be considered as an invalid usage of TD token.
        // For example, we want to forbid usages like this one:
        //     #define MACRO class Class {}
        // But with C-style comments we also disallow the following:
        //     #define MACRO /* This macro is used
        //                      to ... */ class Class {}
        // One can argue that this should be allowed, but it does not seem
        // to be worth of the complication. Moreover, this matches
        // the C preprocessor behavior.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printer in case of error in SkipCComment().
        TokStart = CurPtr;
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // A bare '/' that starts neither comment form is an error.
        TokStart = CurPtr;
        PrintError(CurPtr, "unexpected character");
        return false;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // Do not allow any non-whitespaces after the directive.
      TokStart = CurPtr;
      return false;
    }

    ++CurPtr;
  }

  return true;
}
1036
prepIsProcessingEnabled()1037 bool TGLexer::prepIsProcessingEnabled() {
1038 return all_of(PrepIncludeStack.back(),
1039 [](const PreprocessorControlDesc &I) { return I.IsDefined; });
1040 }
1041
/// Report reaching EOF while at least one preprocessor region is still open,
/// pointing the second note at the most recent unmatched #ifdef/#else.
void TGLexer::prepReportPreprocessorStackError() {
  auto &PrepControl = PrepIncludeStack.back().back();
  PrintError(CurBuf.end(), "reached EOF without matching #endif");
  PrintError(PrepControl.SrcPos, "the latest preprocessor control is here");

  TokStart = CurPtr;
}
1049