1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implement the Lexer for TableGen. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "TGLexer.h" 14 #include "llvm/ADT/ArrayRef.h" 15 #include "llvm/ADT/StringExtras.h" 16 #include "llvm/ADT/StringSwitch.h" 17 #include "llvm/ADT/Twine.h" 18 #include "llvm/Config/config.h" // for strtoull()/strtoll() define 19 #include "llvm/Support/Compiler.h" 20 #include "llvm/Support/MemoryBuffer.h" 21 #include "llvm/Support/SourceMgr.h" 22 #include "llvm/TableGen/Error.h" 23 #include <cerrno> 24 #include <cstdio> 25 #include <cstdlib> 26 #include <cstring> 27 28 using namespace llvm; 29 30 namespace { 31 // A list of supported preprocessing directives with their 32 // internal token kinds and names. 33 struct PreprocessorDir { 34 tgtok::TokKind Kind; 35 StringRef Word; 36 }; 37 } // end anonymous namespace 38 39 /// Returns true if `C` is a valid character in an identifier. If `First` is 40 /// true, returns true if `C` is a valid first character of an identifier, 41 /// else returns true if `C` is a valid non-first character of an identifier. 42 /// Identifiers match the following regular expression: 43 /// [a-zA-Z_][0-9a-zA-Z_]* 44 static bool isValidIDChar(char C, bool First) { 45 if (C == '_' || isAlpha(C)) 46 return true; 47 return !First && isDigit(C); 48 } 49 50 constexpr PreprocessorDir PreprocessorDirs[] = {{tgtok::Ifdef, "ifdef"}, 51 {tgtok::Ifndef, "ifndef"}, 52 {tgtok::Else, "else"}, 53 {tgtok::Endif, "endif"}, 54 {tgtok::Define, "define"}}; 55 56 // Returns a pointer past the end of a valid macro name at the start of `Str`. 
57 // Valid macro names match the regular expression [a-zA-Z_][0-9a-zA-Z_]*. 58 static const char *lexMacroName(StringRef Str) { 59 assert(!Str.empty()); 60 61 // Macro names start with [a-zA-Z_]. 62 const char *Next = Str.begin(); 63 if (!isValidIDChar(*Next, /*First=*/true)) 64 return Next; 65 // Eat the first character of the name. 66 ++Next; 67 68 // Match the rest of the identifier regex: [0-9a-zA-Z_]* 69 const char *End = Str.end(); 70 while (Next != End && isValidIDChar(*Next, /*First=*/false)) 71 ++Next; 72 return Next; 73 } 74 75 TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) { 76 CurBuffer = SrcMgr.getMainFileID(); 77 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 78 CurPtr = CurBuf.begin(); 79 TokStart = nullptr; 80 81 // Pretend that we enter the "top-level" include file. 82 PrepIncludeStack.emplace_back(); 83 84 // Add all macros defined on the command line to the DefinedMacros set. 85 // Check invalid macro names and print fatal error if we find one. 86 for (StringRef MacroName : Macros) { 87 const char *End = lexMacroName(MacroName); 88 if (End != MacroName.end()) 89 PrintFatalError("invalid macro name `" + MacroName + 90 "` specified on command line"); 91 92 DefinedMacros.insert(MacroName); 93 } 94 } 95 96 SMLoc TGLexer::getLoc() const { 97 return SMLoc::getFromPointer(TokStart); 98 } 99 100 SMRange TGLexer::getLocRange() const { 101 return {getLoc(), SMLoc::getFromPointer(CurPtr)}; 102 } 103 104 /// ReturnError - Set the error to the specified string at the specified 105 /// location. This is defined to always return tgtok::Error. 
tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
  PrintError(Loc, Msg);
  return tgtok::Error;
}

/// Convenience overload of ReturnError() taking a raw buffer pointer.
tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  return ReturnError(SMLoc::getFromPointer(Loc), Msg);
}

/// Handle reaching the end of the current buffer. Returns true if we popped
/// back into an including file (lexing should continue there), or false if
/// this was the final EOF of the top-level file.
bool TGLexer::processEOF() {
  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  if (ParentIncludeLoc != SMLoc()) {
    // If prepExitInclude() detects a problem with the preprocessing
    // control stack, it will return false. Pretend that we reached
    // the final EOF and stop lexing more tokens by returning false
    // to LexToken().
    if (!prepExitInclude(false))
      return false;

    CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
    CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
    CurPtr = ParentIncludeLoc.getPointer();
    // Make sure TokStart points into the parent file's buffer.
    // LexToken() assigns to it before calling getNextChar(),
    // so it is pointing into the included file now.
    TokStart = CurPtr;
    return true;
  }

  // Pretend that we exit the "top-level" include file.
  // Note that in case of an error (e.g. control stack imbalance)
  // the routine will issue a fatal error.
  prepExitInclude(true);
  return false;
}

/// Consume and return the next character from the buffer, normalizing
/// newline sequences to '\n' and returning EOF at the end of the buffer.
int TGLexer::getNextChar() {
  char CurChar = *CurPtr++;
  switch (CurChar) {
  default:
    // Return the character as an unsigned char to avoid sign extension
    // for bytes >= 0x80.
    return (unsigned char)CurChar;

  case 0: {
    // A NUL character in the stream is either the end of the current buffer or
    // a spurious NUL in the file. Disambiguate that here.
    if (CurPtr - 1 == CurBuf.end()) {
      --CurPtr; // Arrange for another call to return EOF again.
      return EOF;
    }
    PrintError(getLoc(),
               "NUL character is invalid in source; treated as space");
    return ' ';
  }

  case '\n':
  case '\r':
    // Handle the newline character by ignoring it and incrementing the line
    // count. However, be careful about 'dos style' files with \n\r in them.
    // Only treat a \n\r or \r\n as a single line.
    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
        *CurPtr != CurChar)
      ++CurPtr; // Eat the two char newline sequence.
    return '\n';
  }
}

/// Look at the character `Index` positions ahead of CurPtr without consuming
/// anything. Index 0 is the next unconsumed character.
int TGLexer::peekNextChar(int Index) const {
  return *(CurPtr + Index);
}

/// Lex and return the next token. `FileOrLineStart` is true when we are at
/// the beginning of a file or a line, which is the only position where a
/// '#' may introduce a preprocessing directive.
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  switch (CurChar) {
  default:
    // Handle letters: [a-zA-Z_]
    if (isValidIDChar(CurChar, /*First=*/true))
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "unexpected character");
  case EOF:
    // Lex next token, if we just left an include file.
    // Note that leaving an include file means that the next
    // symbol is located at the end of the 'include "..."'
    // construct, so LexToken() is called with default
    // false parameter.
    if (processEOF())
      return LexToken();

    // Return EOF denoting the end of lexing.
    return tgtok::Eof;

  case ':': return tgtok::colon;
  case ';': return tgtok::semi;
  case ',': return tgtok::comma;
  case '<': return tgtok::less;
  case '>': return tgtok::greater;
  case ']': return tgtok::r_square;
  case '{': return tgtok::l_brace;
  case '}': return tgtok::r_brace;
  case '(': return tgtok::l_paren;
  case ')': return tgtok::r_paren;
  case '=': return tgtok::equal;
  case '?': return tgtok::question;
  case '#':
    // '#' at the start of a line may be a preprocessing directive;
    // anywhere else it is the paste operator.
    if (FileOrLineStart) {
      tgtok::TokKind Kind = prepIsDirective();
      if (Kind != tgtok::Error)
        return lexPreprocessor(Kind);
    }

    return tgtok::paste;

  // The period is a separate case so we can recognize the "..."
  // range punctuator.
  case '.':
    if (peekNextChar(0) == '.') {
      ++CurPtr; // Eat second dot.
      if (peekNextChar(0) == '.') {
        ++CurPtr; // Eat third dot.
        return tgtok::dotdotdot;
      }
      return ReturnError(TokStart, "invalid '..' punctuation");
    }
    return tgtok::dot;

  case '\r':
    // getNextChar() folds '\r' (and CRLF/LFCR pairs) into '\n'.
    llvm_unreachable("getNextChar() must never return '\r'");

  case ' ':
  case '\t':
    // Ignore whitespace.
    return LexToken(FileOrLineStart);
  case '\n':
    // Ignore whitespace, and identify the new line.
    return LexToken(true);
  case '/':
    // If this is the start of a // comment, skip until the end of the line or
    // the end of the buffer.
    if (*CurPtr == '/')
      SkipBCPLComment();
    else if (*CurPtr == '*') {
      if (SkipCComment())
        return tgtok::Error;
    } else // Otherwise, this is an error.
      return ReturnError(TokStart, "unexpected character");
    return LexToken(FileOrLineStart);
  case '-': case '+':
  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
  case '7': case '8': case '9': {
    int NextChar = 0;
    if (isDigit(CurChar)) {
      // Allow identifiers to start with a number if it is followed by
      // an identifier. This can happen with paste operations like
      // foo#8i.
      int i = 0;
      do {
        NextChar = peekNextChar(i++);
      } while (isDigit(NextChar));

      if (NextChar == 'x' || NextChar == 'b') {
        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
        // likely a number.
        int NextNextChar = peekNextChar(i);
        switch (NextNextChar) {
        default:
          break;
        case '0': case '1':
          if (NextChar == 'b')
            return LexNumber();
          [[fallthrough]];
        case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          if (NextChar == 'x')
            return LexNumber();
          break;
        }
      }
    }

    // A digit sequence followed by an identifier character is an
    // identifier (e.g. the "8i" produced by "foo#8i").
    if (isValidIDChar(NextChar, /*First=*/true))
      return LexIdentifier();

    return LexNumber();
  }
  case '"': return LexString();
  case '$': return LexVarName();
  case '[': return LexBracket();
  case '!': return LexExclaim();
  }
}

/// LexString - Lex "[^"]*"
/// Accumulates the (escape-processed) contents into CurStrVal.
tgtok::TokKind TGLexer::LexString() {
  const char *StrStart = CurPtr;

  CurStrVal = "";

  while (*CurPtr != '"') {
    // If we hit the end of the buffer, report an error.
    if (*CurPtr == 0 && CurPtr == CurBuf.end())
      return ReturnError(StrStart, "end of file in string literal");

    if (*CurPtr == '\n' || *CurPtr == '\r')
      return ReturnError(StrStart, "end of line in string literal");

    if (*CurPtr != '\\') {
      CurStrVal += *CurPtr++;
      continue;
    }

    ++CurPtr;

    switch (*CurPtr) {
    case '\\': case '\'': case '"':
      // These turn into their literal character.
      CurStrVal += *CurPtr++;
      break;
    case 't':
      CurStrVal += '\t';
      ++CurPtr;
      break;
    case 'n':
      CurStrVal += '\n';
      ++CurPtr;
      break;

    case '\n':
    case '\r':
      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");

    // If we hit the end of the buffer, report an error.
    case '\0':
      if (CurPtr == CurBuf.end())
        return ReturnError(StrStart, "end of file in string literal");
      [[fallthrough]];
    default:
      return ReturnError(CurPtr, "invalid escape in string literal");
    }
  }

  ++CurPtr;
  return tgtok::StrVal;
}

/// Lex a $-prefixed variable name: '$' [a-zA-Z_][0-9a-zA-Z_]*.
/// The name (without the '$') is stored in CurStrVal.
tgtok::TokKind TGLexer::LexVarName() {
  if (!isValidIDChar(CurPtr[0], /*First=*/true))
    return ReturnError(TokStart, "invalid variable name");

  // Otherwise, we're ok, consume the rest of the characters.
  const char *VarNameStart = CurPtr++;

  while (isValidIDChar(*CurPtr, /*First=*/false))
    ++CurPtr;

  CurStrVal.assign(VarNameStart, CurPtr);
  return tgtok::VarName;
}

/// Lex an identifier whose first character has already been consumed,
/// mapping reserved keywords to their dedicated token kinds.
tgtok::TokKind TGLexer::LexIdentifier() {
  // The first letter is [a-zA-Z_].
  const char *IdentStart = TokStart;

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isValidIDChar(*CurPtr, /*First=*/false))
    ++CurPtr;

  // Check to see if this identifier is a reserved keyword.
  StringRef Str(IdentStart, CurPtr-IdentStart);

  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
                            .Case("int", tgtok::Int)
                            .Case("bit", tgtok::Bit)
                            .Case("bits", tgtok::Bits)
                            .Case("string", tgtok::String)
                            .Case("list", tgtok::List)
                            .Case("code", tgtok::Code)
                            .Case("dag", tgtok::Dag)
                            .Case("class", tgtok::Class)
                            .Case("def", tgtok::Def)
                            .Case("true", tgtok::TrueVal)
                            .Case("false", tgtok::FalseVal)
                            .Case("foreach", tgtok::Foreach)
                            .Case("defm", tgtok::Defm)
                            .Case("defset", tgtok::Defset)
                            .Case("deftype", tgtok::Deftype)
                            .Case("multiclass", tgtok::MultiClass)
                            .Case("field", tgtok::Field)
                            .Case("let", tgtok::Let)
                            .Case("in", tgtok::In)
                            .Case("defvar", tgtok::Defvar)
                            .Case("include", tgtok::Include)
                            .Case("if", tgtok::If)
                            .Case("then", tgtok::Then)
                            .Case("else", tgtok::ElseKW)
                            .Case("assert", tgtok::Assert)
                            .Case("dump", tgtok::Dump)
                            .Default(tgtok::Id);

  // A couple of tokens require special processing.
  switch (Kind) {
  case tgtok::Include:
    // 'include' consumes the following string and switches buffers;
    // the token returned is the first token of the included file.
    if (LexInclude()) return tgtok::Error;
    return Lex();
  case tgtok::Id:
    CurStrVal.assign(Str.begin(), Str.end());
    break;
  default:
    break;
  }

  return Kind;
}

/// LexInclude - We just read the "include" token. Get the string token that
/// comes next and enter the include.
/// Returns true on error (a diagnostic has already been printed).
bool TGLexer::LexInclude() {
  // The token after the include must be a string.
  tgtok::TokKind Tok = LexToken();
  if (Tok == tgtok::Error) return true;
  if (Tok != tgtok::StrVal) {
    PrintError(getLoc(), "expected filename after include");
    return true;
  }

  // Get the string.
  std::string Filename = CurStrVal;
  std::string IncludedFile;

  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
                                    IncludedFile);
  if (!CurBuffer) {
    PrintError(getLoc(), "could not find include file '" + Filename + "'");
    return true;
  }

  Dependencies.insert(IncludedFile);
  // Save the line number and lex buffer of the includer.
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();

  // Each include file gets its own preprocessor control stack frame.
  PrepIncludeStack.emplace_back();
  return false;
}

/// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
/// Or we may end up at the end of the buffer.
void TGLexer::SkipBCPLComment() {
  ++CurPtr; // skip the second slash.
  auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
  CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
}

/// SkipCComment - This skips C-style /**/ comments. The only difference from C
/// is that we allow nesting.
/// Returns true if the comment is unterminated (an error was printed).
bool TGLexer::SkipCComment() {
  ++CurPtr; // skip the star.
  unsigned CommentDepth = 1;

  while (true) {
    int CurChar = getNextChar();
    switch (CurChar) {
    case EOF:
      PrintError(TokStart, "unterminated comment");
      return true;
    case '*':
      // End of the comment?
      if (CurPtr[0] != '/') break;

      ++CurPtr; // End the */.
      if (--CommentDepth == 0)
        return false;
      break;
    case '/':
      // Start of a nested comment?
      if (CurPtr[0] != '*') break;
      ++CurPtr;
      ++CommentDepth;
      break;
    }
  }
}

/// LexNumber - Lex:
///    [-+]?[0-9]+
///    0x[0-9a-fA-F]+
///    0b[01]+
/// The sign character (if any) and the first digit have already been
/// consumed; CurPtr[-1] is that character.
tgtok::TokKind TGLexer::LexNumber() {
  unsigned Base = 0;
  const char *NumStart;

  // Check if it's a hex or a binary value.
  if (CurPtr[-1] == '0') {
    NumStart = CurPtr + 1;
    if (CurPtr[0] == 'x') {
      Base = 16;
      do
        ++CurPtr;
      while (isHexDigit(CurPtr[0]));
    } else if (CurPtr[0] == 'b') {
      Base = 2;
      do
        ++CurPtr;
      while (CurPtr[0] == '0' || CurPtr[0] == '1');
    }
  }

  // For a hex or binary value, we always convert it to an unsigned value.
  bool IsMinus = false;

  // Check if it's a decimal value.
  if (Base == 0) {
    // Check for a sign without a digit.
    if (!isDigit(CurPtr[0])) {
      if (CurPtr[-1] == '-')
        return tgtok::minus;
      else if (CurPtr[-1] == '+')
        return tgtok::plus;
    }

    Base = 10;
    NumStart = TokStart;
    IsMinus = CurPtr[-1] == '-';

    while (isDigit(CurPtr[0]))
      ++CurPtr;
  }

  // Requires at least one digit.
  if (CurPtr == NumStart)
    return ReturnError(TokStart, "invalid number");

  // strtoll/strtoull report overflow via errno, so reset it first.
  errno = 0;
  if (IsMinus)
    CurIntVal = strtoll(NumStart, nullptr, Base);
  else
    CurIntVal = strtoull(NumStart, nullptr, Base);

  if (errno == EINVAL)
    return ReturnError(TokStart, "invalid number");
  if (errno == ERANGE)
    return ReturnError(TokStart, "number out of range");

  return Base == 2 ? tgtok::BinaryIntVal : tgtok::IntVal;
}

/// LexBracket - We just read '['. If this is a code block, return it,
/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
tgtok::TokKind TGLexer::LexBracket() {
  if (CurPtr[0] != '{')
    return tgtok::l_square;
  ++CurPtr;
  const char *CodeStart = CurPtr;
  while (true) {
    int Char = getNextChar();
    if (Char == EOF) break;

    if (Char != '}') continue;

    Char = getNextChar();
    if (Char == EOF) break;
    if (Char == ']') {
      // Store the code fragment without the enclosing "}]".
      CurStrVal.assign(CodeStart, CurPtr-2);
      return tgtok::CodeFragment;
    }
  }

  return ReturnError(CodeStart - 2, "unterminated code block");
}

/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
tgtok::TokKind TGLexer::LexExclaim() {
  if (!isAlpha(*CurPtr))
    return ReturnError(CurPtr - 1, "invalid \"!operator\"");

  const char *Start = CurPtr++;
  while (isAlpha(*CurPtr))
    ++CurPtr;

  // Check to see which operator this is.
  tgtok::TokKind Kind =
      StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
          .Case("eq", tgtok::XEq)
          .Case("ne", tgtok::XNe)
          .Case("le", tgtok::XLe)
          .Case("lt", tgtok::XLt)
          .Case("ge", tgtok::XGe)
          .Case("gt", tgtok::XGt)
          .Case("if", tgtok::XIf)
          .Case("cond", tgtok::XCond)
          .Case("isa", tgtok::XIsA)
          .Case("head", tgtok::XHead)
          .Case("tail", tgtok::XTail)
          .Case("size", tgtok::XSize)
          .Case("con", tgtok::XConcat)
          .Case("dag", tgtok::XDag)
          .Case("add", tgtok::XADD)
          .Case("sub", tgtok::XSUB)
          .Case("mul", tgtok::XMUL)
          .Case("div", tgtok::XDIV)
          .Case("not", tgtok::XNOT)
          .Case("logtwo", tgtok::XLOG2)
          .Case("and", tgtok::XAND)
          .Case("or", tgtok::XOR)
          .Case("xor", tgtok::XXOR)
          .Case("shl", tgtok::XSHL)
          .Case("sra", tgtok::XSRA)
          .Case("srl", tgtok::XSRL)
          .Case("cast", tgtok::XCast)
          .Case("empty", tgtok::XEmpty)
          .Case("subst", tgtok::XSubst)
          .Case("foldl", tgtok::XFoldl)
          .Case("foreach", tgtok::XForEach)
          .Case("filter", tgtok::XFilter)
          .Case("listconcat", tgtok::XListConcat)
          .Case("listflatten", tgtok::XListFlatten)
          .Case("listsplat", tgtok::XListSplat)
          .Case("listremove", tgtok::XListRemove)
          .Case("range", tgtok::XRange)
          .Case("strconcat", tgtok::XStrConcat)
          .Case("initialized", tgtok::XInitialized)
          .Case("interleave", tgtok::XInterleave)
          .Case("instances", tgtok::XInstances)
          .Case("substr", tgtok::XSubstr)
          .Case("find", tgtok::XFind)
          .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
          .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
          .Case("getdagarg", tgtok::XGetDagArg)
          .Case("getdagname", tgtok::XGetDagName)
          .Case("setdagarg", tgtok::XSetDagArg)
          .Case("setdagname", tgtok::XSetDagName)
          .Case("exists", tgtok::XExists)
          .Case("tolower", tgtok::XToLower)
          .Case("toupper", tgtok::XToUpper)
          .Case("repr", tgtok::XRepr)
          .Case("match", tgtok::XMatch)
          .Default(tgtok::Error);

  return Kind != tgtok::Error ? Kind
                              : ReturnError(Start - 1, "unknown operator");
}

/// Pop one frame off the preprocessor include stack when leaving a file.
/// Returns false (after printing an error) if the file still has unclosed
/// #ifdef/#else regions. `IncludeStackMustBeEmpty` asserts whether this
/// was the top-level file.
bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
  // Report an error, if preprocessor control stack for the current
  // file is not empty.
  if (!PrepIncludeStack.back().empty()) {
    prepReportPreprocessorStackError();

    return false;
  }

  // Pop the preprocessing controls from the include stack.
  PrepIncludeStack.pop_back();

  if (IncludeStackMustBeEmpty) {
    assert(PrepIncludeStack.empty() &&
           "preprocessor include stack is not empty");
  } else {
    assert(!PrepIncludeStack.empty() && "preprocessor include stack is empty");
  }

  return true;
}

/// Check whether CurPtr (just past a '#') points at a supported
/// preprocessing directive. Returns the directive's token kind, or
/// tgtok::Error if this is not a recognized directive. Does not
/// advance CurPtr.
tgtok::TokKind TGLexer::prepIsDirective() const {
  for (const auto [Kind, Word] : PreprocessorDirs) {
    if (StringRef(CurPtr, Word.size()) != Word)
      continue;
    int NextChar = peekNextChar(Word.size());

    // Check for whitespace after the directive. If there is no whitespace,
    // then we do not recognize it as a preprocessing directive.

    // New line and EOF may follow only #else/#endif. It will be reported
    // as an error for #ifdef/#define after the call to prepLexMacroName().
    if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
        NextChar == '\n' ||
        // It looks like TableGen does not support '\r' as the actual
        // carriage return, e.g. getNextChar() treats a single '\r'
        // as '\n'. So we do the same here.
        NextChar == '\r')
      return Kind;

    // Allow comments after some directives, e.g.:
    //     #else// OR #else/**/
    //     #endif// OR #endif/**/
    //
    // Note that we do allow comments after #ifdef/#define here, e.g.
    //     #ifdef/**/ AND #ifdef//
    //     #define/**/ AND #define//
    //
    // These cases will be reported as incorrect after calling
    // prepLexMacroName(). We could have supported C-style comments
    // after #ifdef/#define, but this would complicate the code
    // for little benefit.
    if (NextChar == '/') {
      NextChar = peekNextChar(Word.size() + 1);

      if (NextChar == '*' || NextChar == '/')
        return Kind;

      // Pretend that we do not recognize the directive.
    }
  }

  return tgtok::Error;
}

/// Advance CurPtr past the directive word for `Kind`, which must have been
/// recognized by prepIsDirective().
void TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
  TokStart = CurPtr;

  for (const auto [PKind, PWord] : PreprocessorDirs) {
    if (PKind == Kind) {
      // Advance CurPtr to the end of the preprocessing word.
      CurPtr += PWord.size();
      return;
    }
  }

  llvm_unreachable(
      "unsupported preprocessing token in prepEatPreprocessorDirective()");
}

/// Process one preprocessing directive of kind `Kind` starting at CurPtr.
/// `ReturnNextLiveToken` is true when called from normal token processing
/// (return the next live token) and false when called from the
/// lines-skipping mode of prepSkipRegion() (return the directive's kind).
tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
                                        bool ReturnNextLiveToken) {
  // We must be looking at a preprocessing directive. Eat it!
  prepEatPreprocessorDirective(Kind);

  if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
    StringRef MacroName = prepLexMacroName();
    StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
    if (MacroName.empty())
      return ReturnError(TokStart, "expected macro name after " + IfTokName);

    bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;

    // Canonicalize ifndef's MacroIsDefined to its ifdef equivalent.
    if (Kind == tgtok::Ifndef)
      MacroIsDefined = !MacroIsDefined;

    // Regardless of whether we are processing tokens or not,
    // we put the #ifdef control on stack.
    // Note that MacroIsDefined has been canonicalized against ifdef.
    PrepIncludeStack.back().push_back(
        {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "only comments are supported after " +
                                     IfTokName + " NAME");

    // If we were not processing tokens before this #ifdef,
    // then just return back to the lines skipping code.
    if (!ReturnNextLiveToken)
      return Kind;

    // If we were processing tokens before this #ifdef,
    // and the macro is defined, then just return the next token.
    if (MacroIsDefined)
      return LexToken();

    // We were processing tokens before this #ifdef, and the macro
    // is not defined, so we have to start skipping the lines.
    // If the skipping is successful, it will return the token following
    // either #else or #endif corresponding to this #ifdef.
    if (prepSkipRegion(ReturnNextLiveToken))
      return LexToken();

    return tgtok::Error;
  } else if (Kind == tgtok::Else) {
    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #else.
    if (PrepIncludeStack.back().empty())
      return ReturnError(TokStart, "#else without #ifdef or #ifndef");

    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back();

    if (IfdefEntry.Kind != tgtok::Ifdef) {
      PrintError(TokStart, "double #else");
      return ReturnError(IfdefEntry.SrcPos, "previous #else is here");
    }

    // Replace the corresponding #ifdef's control with its negation
    // on the control stack.
    PrepIncludeStack.back().back() = {Kind, !IfdefEntry.IsDefined,
                                      SMLoc::getFromPointer(TokStart)};

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "only comments are supported after #else");

    // If we were processing tokens before this #else,
    // we have to start skipping lines until the matching #endif.
    if (ReturnNextLiveToken) {
      if (prepSkipRegion(ReturnNextLiveToken))
        return LexToken();

      return tgtok::Error;
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Endif) {
    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #endif.
    if (PrepIncludeStack.back().empty())
      return ReturnError(TokStart, "#endif without #ifdef");

    [[maybe_unused]] auto &IfdefOrElseEntry = PrepIncludeStack.back().back();

    assert((IfdefOrElseEntry.Kind == tgtok::Ifdef ||
            IfdefOrElseEntry.Kind == tgtok::Else) &&
           "invalid preprocessor control on the stack");

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "only comments are supported after #endif");

    PrepIncludeStack.back().pop_back();

    // If we were processing tokens before this #endif, then
    // we should continue it.
    if (ReturnNextLiveToken) {
      return LexToken();
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Define) {
    StringRef MacroName = prepLexMacroName();
    if (MacroName.empty())
      return ReturnError(TokStart, "expected macro name after #define");

    if (!DefinedMacros.insert(MacroName).second)
      PrintWarning(getLoc(),
                   "duplicate definition of macro: " + Twine(MacroName));

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr,
                         "only comments are supported after #define NAME");

    assert(ReturnNextLiveToken &&
           "#define must be ignored during the lines skipping");

    return LexToken();
  }

  llvm_unreachable("preprocessing directive is not supported");
}

/// Skip lines of a disabled #ifdef/#else region until the directive that
/// re-enables token processing (#else or #endif). Returns true if such a
/// directive was found, false on error or EOF without a matching #endif.
bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
  assert(MustNeverBeFalse && "invalid recursion.");

  do {
    // Skip all symbols to the line end.
    while (*CurPtr != '\n')
      ++CurPtr;

    // Find the first non-whitespace symbol in the next line(s).
    if (!prepSkipLineBegin())
      return false;

    // If the first non-blank/comment symbol on the line is '#',
    // it may be a start of preprocessing directive.
    //
    // If it is not '#' just go to the next line.
    if (*CurPtr == '#')
      ++CurPtr;
    else
      continue;

    tgtok::TokKind Kind = prepIsDirective();

    // If we did not find a preprocessing directive or it is #define,
    // then just skip to the next line. We do not have to do anything
    // for #define in the line-skipping mode.
    if (Kind == tgtok::Error || Kind == tgtok::Define)
      continue;

    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);

    // If lexPreprocessor() encountered an error during lexing this
    // preprocessor idiom, then return false to the calling lexPreprocessor().
    // This will force tgtok::Error to be returned to the tokens processing.
    if (ProcessedKind == tgtok::Error)
      return false;

    assert(Kind == ProcessedKind && "prepIsDirective() and lexPreprocessor() "
                                    "returned different token kinds");

    // If this preprocessing directive enables tokens processing,
    // then return to the lexPreprocessor() and get to the next token.
    // We can move from line-skipping mode to processing tokens only
    // due to #else or #endif.
    if (prepIsProcessingEnabled()) {
      assert((Kind == tgtok::Else || Kind == tgtok::Endif) &&
             "tokens processing was enabled by an unexpected preprocessing "
             "directive");

      return true;
    }
  } while (CurPtr != CurBuf.end());

  // We have reached the end of the file, but never left the lines-skipping
  // mode. This means there is no matching #endif.
  prepReportPreprocessorStackError();
  return false;
}

/// Lex a macro name following a preprocessing directive. Returns the name,
/// which is empty if no valid macro name starts at CurPtr.
StringRef TGLexer::prepLexMacroName() {
  // Skip whitespaces between the preprocessing directive and the macro name.
  while (*CurPtr == ' ' || *CurPtr == '\t')
    ++CurPtr;

  TokStart = CurPtr;
  CurPtr = lexMacroName(StringRef(CurPtr, CurBuf.end() - CurPtr));
  return StringRef(TokStart, CurPtr - TokStart);
}

/// In lines-skipping mode, advance CurPtr to the first non-whitespace,
/// non-C-comment character (or the buffer end). Returns false only if an
/// unterminated C-style comment was found.
bool TGLexer::prepSkipLineBegin() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      break;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '*') {
        // Skip C-style comment.
        // Note that we do not care about skipping the C++-style comments.
        // If the line contains "//", it may not contain any processable
        // preprocessing directive. Just return CurPtr pointing to
        // the first '/' in this case. We also do not care about
        // incorrect symbols after the first '/' - we are in lines-skipping
        // mode, so incorrect code is allowed to some extent.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;

        // CurPtr must point to '*' before call to SkipCComment().
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // CurPtr points to the non-whitespace '/'.
        return true;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      return true;
    }

    ++CurPtr;
  }

  // We have reached the end of the file. Return to the lines skipping
  // code, and allow it to handle the EOF as needed.
  return true;
}

/// Skip to the end of the line after a preprocessing directive, allowing
/// only whitespace and comments. Returns false (after printing an error)
/// if any other token follows the directive on the same line.
bool TGLexer::prepSkipDirectiveEnd() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
      break;

    case '\n':
    case '\r':
      return true;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '/') {
        // Skip C++-style comment.
        // We may just return true now, but let's skip to the line/buffer end
        // to simplify the method specification.
        ++CurPtr;
        SkipBCPLComment();
      } else if (NextChar == '*') {
        // When we are skipping C-style comment at the end of a preprocessing
        // directive, we can skip several lines. If any meaningful TD token
        // follows the end of the C-style comment on the same line, it will
        // be considered as an invalid usage of TD token.
        // For example, we want to forbid usages like this one:
        //     #define MACRO class Class {}
        // But with C-style comments we also disallow the following:
        //     #define MACRO /* This macro is used
        //                      to ... */ class Class {}
        // One can argue that this should be allowed, but it does not seem
        // to be worth of the complication. Moreover, this matches
        // the C preprocessor behavior.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printer in case of error in SkipCComment().
        TokStart = CurPtr;
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        TokStart = CurPtr;
        PrintError(CurPtr, "unexpected character");
        return false;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // Do not allow any non-whitespaces after the directive.
      TokStart = CurPtr;
      return false;
    }

    ++CurPtr;
  }

  return true;
}

/// Return true if tokens should currently be processed, i.e. every #ifdef
/// region on the current file's control stack is enabled.
bool TGLexer::prepIsProcessingEnabled() {
  return all_of(PrepIncludeStack.back(),
                [](const PreprocessorControlDesc &I) { return I.IsDefined; });
}

/// Report an unbalanced preprocessor control stack (EOF reached with an
/// unmatched #ifdef/#else), pointing at the most recent open control.
void TGLexer::prepReportPreprocessorStackError() {
  auto &PrepControl = PrepIncludeStack.back().back();
  PrintError(CurBuf.end(), "reached EOF without matching #endif");
  PrintError(PrepControl.SrcPos, "the latest preprocessor control is here");

  TokStart = CurPtr;
}