1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implement the Lexer for TableGen. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "TGLexer.h" 14 #include "llvm/ADT/StringSwitch.h" 15 #include "llvm/ADT/Twine.h" 16 #include "llvm/Config/config.h" // for strtoull()/strtoll() define 17 #include "llvm/Support/Compiler.h" 18 #include "llvm/Support/MemoryBuffer.h" 19 #include "llvm/Support/SourceMgr.h" 20 #include "llvm/TableGen/Error.h" 21 #include <algorithm> 22 #include <cctype> 23 #include <cerrno> 24 #include <cstdint> 25 #include <cstdio> 26 #include <cstdlib> 27 #include <cstring> 28 29 using namespace llvm; 30 31 namespace { 32 // A list of supported preprocessing directives with their 33 // internal token kinds and names. 34 struct { 35 tgtok::TokKind Kind; 36 const char *Word; 37 } PreprocessorDirs[] = { 38 { tgtok::Ifdef, "ifdef" }, 39 { tgtok::Ifndef, "ifndef" }, 40 { tgtok::Else, "else" }, 41 { tgtok::Endif, "endif" }, 42 { tgtok::Define, "define" } 43 }; 44 } // end anonymous namespace 45 46 TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) { 47 CurBuffer = SrcMgr.getMainFileID(); 48 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 49 CurPtr = CurBuf.begin(); 50 TokStart = nullptr; 51 52 // Pretend that we enter the "top-level" include file. 53 PrepIncludeStack.push_back( 54 make_unique<std::vector<PreprocessorControlDesc>>()); 55 56 // Put all macros defined in the command line into the DefinedMacros set. 57 std::for_each(Macros.begin(), Macros.end(), 58 [this](const std::string &MacroName) { 59 DefinedMacros.insert(MacroName); 60 }); 61 } 62 63 SMLoc TGLexer::getLoc() const { 64 return SMLoc::getFromPointer(TokStart); 65 } 66 67 /// ReturnError - Set the error to the specified string at the specified 68 /// location. This is defined to always return tgtok::Error. 69 tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) { 70 PrintError(Loc, Msg); 71 return tgtok::Error; 72 } 73 74 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 75 return ReturnError(SMLoc::getFromPointer(Loc), Msg); 76 } 77 78 bool TGLexer::processEOF() { 79 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 80 if (ParentIncludeLoc != SMLoc()) { 81 // If prepExitInclude() detects a problem with the preprocessing 82 // control stack, it will return false. Pretend that we reached 83 // the final EOF and stop lexing more tokens by returning false 84 // to LexToken(). 85 if (!prepExitInclude(false)) 86 return false; 87 88 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 89 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 90 CurPtr = ParentIncludeLoc.getPointer(); 91 // Make sure TokStart points into the parent file's buffer. 92 // LexToken() assigns to it before calling getNextChar(), 93 // so it is pointing into the included file now. 94 TokStart = CurPtr; 95 return true; 96 } 97 98 // Pretend that we exit the "top-level" include file. 99 // Note that in case of an error (e.g. control stack imbalance) 100 // the routine will issue a fatal error. 101 prepExitInclude(true); 102 return false; 103 } 104 105 int TGLexer::getNextChar() { 106 char CurChar = *CurPtr++; 107 switch (CurChar) { 108 default: 109 return (unsigned char)CurChar; 110 case 0: { 111 // A nul character in the stream is either the end of the current buffer or 112 // a random nul in the file. Disambiguate that here. 113 if (CurPtr-1 != CurBuf.end()) 114 return 0; // Just whitespace. 115 116 // Otherwise, return end of file. 117 --CurPtr; // Another call to lex will return EOF again. 118 return EOF; 119 } 120 case '\n': 121 case '\r': 122 // Handle the newline character by ignoring it and incrementing the line 123 // count. However, be careful about 'dos style' files with \n\r in them. 124 // Only treat a \n\r or \r\n as a single line. 125 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 126 *CurPtr != CurChar) 127 ++CurPtr; // Eat the two char newline sequence. 128 return '\n'; 129 } 130 } 131 132 int TGLexer::peekNextChar(int Index) const { 133 return *(CurPtr + Index); 134 } 135 136 tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { 137 TokStart = CurPtr; 138 // This always consumes at least one character. 139 int CurChar = getNextChar(); 140 141 switch (CurChar) { 142 default: 143 // Handle letters: [a-zA-Z_] 144 if (isalpha(CurChar) || CurChar == '_') 145 return LexIdentifier(); 146 147 // Unknown character, emit an error. 148 return ReturnError(TokStart, "Unexpected character"); 149 case EOF: 150 // Lex next token, if we just left an include file. 151 // Note that leaving an include file means that the next 152 // symbol is located at the end of 'include "..."' 153 // construct, so LexToken() is called with default 154 // false parameter. 155 if (processEOF()) 156 return LexToken(); 157 158 // Return EOF denoting the end of lexing. 159 return tgtok::Eof; 160 161 case ':': return tgtok::colon; 162 case ';': return tgtok::semi; 163 case '.': return tgtok::period; 164 case ',': return tgtok::comma; 165 case '<': return tgtok::less; 166 case '>': return tgtok::greater; 167 case ']': return tgtok::r_square; 168 case '{': return tgtok::l_brace; 169 case '}': return tgtok::r_brace; 170 case '(': return tgtok::l_paren; 171 case ')': return tgtok::r_paren; 172 case '=': return tgtok::equal; 173 case '?': return tgtok::question; 174 case '#': 175 if (FileOrLineStart) { 176 tgtok::TokKind Kind = prepIsDirective(); 177 if (Kind != tgtok::Error) 178 return lexPreprocessor(Kind); 179 } 180 181 return tgtok::paste; 182 183 case '\r': 184 PrintFatalError("getNextChar() must never return '\r'"); 185 return tgtok::Error; 186 187 case 0: 188 case ' ': 189 case '\t': 190 // Ignore whitespace. 191 return LexToken(FileOrLineStart); 192 case '\n': 193 // Ignore whitespace, and identify the new line. 194 return LexToken(true); 195 case '/': 196 // If this is the start of a // comment, skip until the end of the line or 197 // the end of the buffer. 198 if (*CurPtr == '/') 199 SkipBCPLComment(); 200 else if (*CurPtr == '*') { 201 if (SkipCComment()) 202 return tgtok::Error; 203 } else // Otherwise, this is an error. 204 return ReturnError(TokStart, "Unexpected character"); 205 return LexToken(FileOrLineStart); 206 case '-': case '+': 207 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 208 case '7': case '8': case '9': { 209 int NextChar = 0; 210 if (isdigit(CurChar)) { 211 // Allow identifiers to start with a number if it is followed by 212 // an identifier. This can happen with paste operations like 213 // foo#8i. 214 int i = 0; 215 do { 216 NextChar = peekNextChar(i++); 217 } while (isdigit(NextChar)); 218 219 if (NextChar == 'x' || NextChar == 'b') { 220 // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 221 // likely a number. 222 int NextNextChar = peekNextChar(i); 223 switch (NextNextChar) { 224 default: 225 break; 226 case '0': case '1': 227 if (NextChar == 'b') 228 return LexNumber(); 229 LLVM_FALLTHROUGH; 230 case '2': case '3': case '4': case '5': 231 case '6': case '7': case '8': case '9': 232 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 233 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 234 if (NextChar == 'x') 235 return LexNumber(); 236 break; 237 } 238 } 239 } 240 241 if (isalpha(NextChar) || NextChar == '_') 242 return LexIdentifier(); 243 244 return LexNumber(); 245 } 246 case '"': return LexString(); 247 case '$': return LexVarName(); 248 case '[': return LexBracket(); 249 case '!': return LexExclaim(); 250 } 251 } 252 253 /// LexString - Lex "[^"]*" 254 tgtok::TokKind TGLexer::LexString() { 255 const char *StrStart = CurPtr; 256 257 CurStrVal = ""; 258 259 while (*CurPtr != '"') { 260 // If we hit the end of the buffer, report an error. 261 if (*CurPtr == 0 && CurPtr == CurBuf.end()) 262 return ReturnError(StrStart, "End of file in string literal"); 263 264 if (*CurPtr == '\n' || *CurPtr == '\r') 265 return ReturnError(StrStart, "End of line in string literal"); 266 267 if (*CurPtr != '\\') { 268 CurStrVal += *CurPtr++; 269 continue; 270 } 271 272 ++CurPtr; 273 274 switch (*CurPtr) { 275 case '\\': case '\'': case '"': 276 // These turn into their literal character. 277 CurStrVal += *CurPtr++; 278 break; 279 case 't': 280 CurStrVal += '\t'; 281 ++CurPtr; 282 break; 283 case 'n': 284 CurStrVal += '\n'; 285 ++CurPtr; 286 break; 287 288 case '\n': 289 case '\r': 290 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 291 292 // If we hit the end of the buffer, report an error. 293 case '\0': 294 if (CurPtr == CurBuf.end()) 295 return ReturnError(StrStart, "End of file in string literal"); 296 LLVM_FALLTHROUGH; 297 default: 298 return ReturnError(CurPtr, "invalid escape in string literal"); 299 } 300 } 301 302 ++CurPtr; 303 return tgtok::StrVal; 304 } 305 306 tgtok::TokKind TGLexer::LexVarName() { 307 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 308 return ReturnError(TokStart, "Invalid variable name"); 309 310 // Otherwise, we're ok, consume the rest of the characters. 311 const char *VarNameStart = CurPtr++; 312 313 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 314 ++CurPtr; 315 316 CurStrVal.assign(VarNameStart, CurPtr); 317 return tgtok::VarName; 318 } 319 320 tgtok::TokKind TGLexer::LexIdentifier() { 321 // The first letter is [a-zA-Z_]. 322 const char *IdentStart = TokStart; 323 324 // Match the rest of the identifier regex: [0-9a-zA-Z_]* 325 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 326 ++CurPtr; 327 328 // Check to see if this identifier is a keyword. 329 StringRef Str(IdentStart, CurPtr-IdentStart); 330 331 if (Str == "include") { 332 if (LexInclude()) return tgtok::Error; 333 return Lex(); 334 } 335 336 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 337 .Case("int", tgtok::Int) 338 .Case("bit", tgtok::Bit) 339 .Case("bits", tgtok::Bits) 340 .Case("string", tgtok::String) 341 .Case("list", tgtok::List) 342 .Case("code", tgtok::Code) 343 .Case("dag", tgtok::Dag) 344 .Case("class", tgtok::Class) 345 .Case("def", tgtok::Def) 346 .Case("foreach", tgtok::Foreach) 347 .Case("defm", tgtok::Defm) 348 .Case("defset", tgtok::Defset) 349 .Case("multiclass", tgtok::MultiClass) 350 .Case("field", tgtok::Field) 351 .Case("let", tgtok::Let) 352 .Case("in", tgtok::In) 353 .Default(tgtok::Id); 354 355 if (Kind == tgtok::Id) 356 CurStrVal.assign(Str.begin(), Str.end()); 357 return Kind; 358 } 359 360 /// LexInclude - We just read the "include" token. Get the string token that 361 /// comes next and enter the include. 362 bool TGLexer::LexInclude() { 363 // The token after the include must be a string. 364 tgtok::TokKind Tok = LexToken(); 365 if (Tok == tgtok::Error) return true; 366 if (Tok != tgtok::StrVal) { 367 PrintError(getLoc(), "Expected filename after include"); 368 return true; 369 } 370 371 // Get the string. 372 std::string Filename = CurStrVal; 373 std::string IncludedFile; 374 375 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 376 IncludedFile); 377 if (!CurBuffer) { 378 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 379 return true; 380 } 381 382 DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile); 383 if (Found != Dependencies.end()) { 384 PrintError(getLoc(), 385 "File '" + IncludedFile + "' has already been included."); 386 SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note, 387 "previously included here"); 388 return true; 389 } 390 Dependencies.insert(std::make_pair(IncludedFile, getLoc())); 391 // Save the line number and lex buffer of the includer. 392 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 393 CurPtr = CurBuf.begin(); 394 395 PrepIncludeStack.push_back( 396 make_unique<std::vector<PreprocessorControlDesc>>()); 397 return false; 398 } 399 400 void TGLexer::SkipBCPLComment() { 401 ++CurPtr; // skip the second slash. 402 while (true) { 403 switch (*CurPtr) { 404 case '\n': 405 case '\r': 406 return; // Newline is end of comment. 407 case 0: 408 // If this is the end of the buffer, end the comment. 409 if (CurPtr == CurBuf.end()) 410 return; 411 break; 412 } 413 // Otherwise, skip the character. 414 ++CurPtr; 415 } 416 } 417 418 /// SkipCComment - This skips C-style /**/ comments. The only difference from C 419 /// is that we allow nesting. 420 bool TGLexer::SkipCComment() { 421 ++CurPtr; // skip the star. 422 unsigned CommentDepth = 1; 423 424 while (true) { 425 int CurChar = getNextChar(); 426 switch (CurChar) { 427 case EOF: 428 PrintError(TokStart, "Unterminated comment!"); 429 return true; 430 case '*': 431 // End of the comment? 432 if (CurPtr[0] != '/') break; 433 434 ++CurPtr; // End the */. 435 if (--CommentDepth == 0) 436 return false; 437 break; 438 case '/': 439 // Start of a nested comment? 440 if (CurPtr[0] != '*') break; 441 ++CurPtr; 442 ++CommentDepth; 443 break; 444 } 445 } 446 } 447 448 /// LexNumber - Lex: 449 /// [-+]?[0-9]+ 450 /// 0x[0-9a-fA-F]+ 451 /// 0b[01]+ 452 tgtok::TokKind TGLexer::LexNumber() { 453 if (CurPtr[-1] == '0') { 454 if (CurPtr[0] == 'x') { 455 ++CurPtr; 456 const char *NumStart = CurPtr; 457 while (isxdigit(CurPtr[0])) 458 ++CurPtr; 459 460 // Requires at least one hex digit. 461 if (CurPtr == NumStart) 462 return ReturnError(TokStart, "Invalid hexadecimal number"); 463 464 errno = 0; 465 CurIntVal = strtoll(NumStart, nullptr, 16); 466 if (errno == EINVAL) 467 return ReturnError(TokStart, "Invalid hexadecimal number"); 468 if (errno == ERANGE) { 469 errno = 0; 470 CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16); 471 if (errno == EINVAL) 472 return ReturnError(TokStart, "Invalid hexadecimal number"); 473 if (errno == ERANGE) 474 return ReturnError(TokStart, "Hexadecimal number out of range"); 475 } 476 return tgtok::IntVal; 477 } else if (CurPtr[0] == 'b') { 478 ++CurPtr; 479 const char *NumStart = CurPtr; 480 while (CurPtr[0] == '0' || CurPtr[0] == '1') 481 ++CurPtr; 482 483 // Requires at least one binary digit. 484 if (CurPtr == NumStart) 485 return ReturnError(CurPtr-2, "Invalid binary number"); 486 CurIntVal = strtoll(NumStart, nullptr, 2); 487 return tgtok::BinaryIntVal; 488 } 489 } 490 491 // Check for a sign without a digit. 492 if (!isdigit(CurPtr[0])) { 493 if (CurPtr[-1] == '-') 494 return tgtok::minus; 495 else if (CurPtr[-1] == '+') 496 return tgtok::plus; 497 } 498 499 while (isdigit(CurPtr[0])) 500 ++CurPtr; 501 CurIntVal = strtoll(TokStart, nullptr, 10); 502 return tgtok::IntVal; 503 } 504 505 /// LexBracket - We just read '['. If this is a code block, return it, 506 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 507 tgtok::TokKind TGLexer::LexBracket() { 508 if (CurPtr[0] != '{') 509 return tgtok::l_square; 510 ++CurPtr; 511 const char *CodeStart = CurPtr; 512 while (true) { 513 int Char = getNextChar(); 514 if (Char == EOF) break; 515 516 if (Char != '}') continue; 517 518 Char = getNextChar(); 519 if (Char == EOF) break; 520 if (Char == ']') { 521 CurStrVal.assign(CodeStart, CurPtr-2); 522 return tgtok::CodeFragment; 523 } 524 } 525 526 return ReturnError(CodeStart-2, "Unterminated Code Block"); 527 } 528 529 /// LexExclaim - Lex '!' and '![a-zA-Z]+'. 530 tgtok::TokKind TGLexer::LexExclaim() { 531 if (!isalpha(*CurPtr)) 532 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 533 534 const char *Start = CurPtr++; 535 while (isalpha(*CurPtr)) 536 ++CurPtr; 537 538 // Check to see which operator this is. 539 tgtok::TokKind Kind = 540 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 541 .Case("eq", tgtok::XEq) 542 .Case("ne", tgtok::XNe) 543 .Case("le", tgtok::XLe) 544 .Case("lt", tgtok::XLt) 545 .Case("ge", tgtok::XGe) 546 .Case("gt", tgtok::XGt) 547 .Case("if", tgtok::XIf) 548 .Case("cond", tgtok::XCond) 549 .Case("isa", tgtok::XIsA) 550 .Case("head", tgtok::XHead) 551 .Case("tail", tgtok::XTail) 552 .Case("size", tgtok::XSize) 553 .Case("con", tgtok::XConcat) 554 .Case("dag", tgtok::XDag) 555 .Case("add", tgtok::XADD) 556 .Case("mul", tgtok::XMUL) 557 .Case("and", tgtok::XAND) 558 .Case("or", tgtok::XOR) 559 .Case("shl", tgtok::XSHL) 560 .Case("sra", tgtok::XSRA) 561 .Case("srl", tgtok::XSRL) 562 .Case("cast", tgtok::XCast) 563 .Case("empty", tgtok::XEmpty) 564 .Case("subst", tgtok::XSubst) 565 .Case("foldl", tgtok::XFoldl) 566 .Case("foreach", tgtok::XForEach) 567 .Case("listconcat", tgtok::XListConcat) 568 .Case("listsplat", tgtok::XListSplat) 569 .Case("strconcat", tgtok::XStrConcat) 570 .Default(tgtok::Error); 571 572 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 573 } 574 575 bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { 576 // Report an error, if preprocessor control stack for the current 577 // file is not empty. 578 if (!PrepIncludeStack.back()->empty()) { 579 prepReportPreprocessorStackError(); 580 581 return false; 582 } 583 584 // Pop the preprocessing controls from the include stack. 585 if (PrepIncludeStack.empty()) { 586 PrintFatalError("Preprocessor include stack is empty"); 587 } 588 589 PrepIncludeStack.pop_back(); 590 591 if (IncludeStackMustBeEmpty) { 592 if (!PrepIncludeStack.empty()) 593 PrintFatalError("Preprocessor include stack is not empty"); 594 } else { 595 if (PrepIncludeStack.empty()) 596 PrintFatalError("Preprocessor include stack is empty"); 597 } 598 599 return true; 600 } 601 602 tgtok::TokKind TGLexer::prepIsDirective() const { 603 for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID) { 604 int NextChar = *CurPtr; 605 bool Match = true; 606 unsigned I = 0; 607 for (; I < strlen(PreprocessorDirs[ID].Word); ++I) { 608 if (NextChar != PreprocessorDirs[ID].Word[I]) { 609 Match = false; 610 break; 611 } 612 613 NextChar = peekNextChar(I + 1); 614 } 615 616 // Check for whitespace after the directive. If there is no whitespace, 617 // then we do not recognize it as a preprocessing directive. 618 if (Match) { 619 tgtok::TokKind Kind = PreprocessorDirs[ID].Kind; 620 621 // New line and EOF may follow only #else/#endif. It will be reported 622 // as an error for #ifdef/#define after the call to prepLexMacroName(). 623 if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF || 624 NextChar == '\n' || 625 // It looks like TableGen does not support '\r' as the actual 626 // carriage return, e.g. getNextChar() treats a single '\r' 627 // as '\n'. So we do the same here. 628 NextChar == '\r') 629 return Kind; 630 631 // Allow comments after some directives, e.g.: 632 // #else// OR #else/**/ 633 // #endif// OR #endif/**/ 634 // 635 // Note that we do allow comments after #ifdef/#define here, e.g. 636 // #ifdef/**/ AND #ifdef// 637 // #define/**/ AND #define// 638 // 639 // These cases will be reported as incorrect after calling 640 // prepLexMacroName(). We could have supported C-style comments 641 // after #ifdef/#define, but this would complicate the code 642 // for little benefit. 643 if (NextChar == '/') { 644 NextChar = peekNextChar(I + 1); 645 646 if (NextChar == '*' || NextChar == '/') 647 return Kind; 648 649 // Pretend that we do not recognize the directive. 650 } 651 } 652 } 653 654 return tgtok::Error; 655 } 656 657 bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { 658 TokStart = CurPtr; 659 660 for (unsigned ID = 0; ID < llvm::array_lengthof(PreprocessorDirs); ++ID) 661 if (PreprocessorDirs[ID].Kind == Kind) { 662 // Advance CurPtr to the end of the preprocessing word. 663 CurPtr += strlen(PreprocessorDirs[ID].Word); 664 return true; 665 } 666 667 PrintFatalError("Unsupported preprocessing token in " 668 "prepEatPreprocessorDirective()"); 669 return false; 670 } 671 672 tgtok::TokKind TGLexer::lexPreprocessor( 673 tgtok::TokKind Kind, bool ReturnNextLiveToken) { 674 675 // We must be looking at a preprocessing directive. Eat it! 676 if (!prepEatPreprocessorDirective(Kind)) 677 PrintFatalError("lexPreprocessor() called for unknown " 678 "preprocessor directive"); 679 680 if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) { 681 StringRef MacroName = prepLexMacroName(); 682 StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef"; 683 if (MacroName.empty()) 684 return ReturnError(TokStart, "Expected macro name after " + IfTokName); 685 686 bool MacroIsDefined = DefinedMacros.count(MacroName) != 0; 687 688 // Canonicalize ifndef to ifdef equivalent 689 if (Kind == tgtok::Ifndef) { 690 MacroIsDefined = !MacroIsDefined; 691 Kind = tgtok::Ifdef; 692 } 693 694 // Regardless of whether we are processing tokens or not, 695 // we put the #ifdef control on stack. 696 PrepIncludeStack.back()->push_back( 697 {Kind, MacroIsDefined, SMLoc::getFromPointer(TokStart)}); 698 699 if (!prepSkipDirectiveEnd()) 700 return ReturnError(CurPtr, "Only comments are supported after " + 701 IfTokName + " NAME"); 702 703 // If we were not processing tokens before this #ifdef, 704 // then just return back to the lines skipping code. 705 if (!ReturnNextLiveToken) 706 return Kind; 707 708 // If we were processing tokens before this #ifdef, 709 // and the macro is defined, then just return the next token. 710 if (MacroIsDefined) 711 return LexToken(); 712 713 // We were processing tokens before this #ifdef, and the macro 714 // is not defined, so we have to start skipping the lines. 715 // If the skipping is successful, it will return the token following 716 // either #else or #endif corresponding to this #ifdef. 717 if (prepSkipRegion(ReturnNextLiveToken)) 718 return LexToken(); 719 720 return tgtok::Error; 721 } else if (Kind == tgtok::Else) { 722 // Check if this #else is correct before calling prepSkipDirectiveEnd(), 723 // which will move CurPtr away from the beginning of #else. 724 if (PrepIncludeStack.back()->empty()) 725 return ReturnError(TokStart, "#else without #ifdef or #ifndef"); 726 727 PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back(); 728 729 if (IfdefEntry.Kind != tgtok::Ifdef) { 730 PrintError(TokStart, "double #else"); 731 return ReturnError(IfdefEntry.SrcPos, "Previous #else is here"); 732 } 733 734 // Replace the corresponding #ifdef's control with its negation 735 // on the control stack. 736 PrepIncludeStack.back()->pop_back(); 737 PrepIncludeStack.back()->push_back( 738 {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)}); 739 740 if (!prepSkipDirectiveEnd()) 741 return ReturnError(CurPtr, "Only comments are supported after #else"); 742 743 // If we were processing tokens before this #else, 744 // we have to start skipping lines until the matching #endif. 745 if (ReturnNextLiveToken) { 746 if (prepSkipRegion(ReturnNextLiveToken)) 747 return LexToken(); 748 749 return tgtok::Error; 750 } 751 752 // Return to the lines skipping code. 753 return Kind; 754 } else if (Kind == tgtok::Endif) { 755 // Check if this #endif is correct before calling prepSkipDirectiveEnd(), 756 // which will move CurPtr away from the beginning of #endif. 757 if (PrepIncludeStack.back()->empty()) 758 return ReturnError(TokStart, "#endif without #ifdef"); 759 760 auto &IfdefOrElseEntry = PrepIncludeStack.back()->back(); 761 762 if (IfdefOrElseEntry.Kind != tgtok::Ifdef && 763 IfdefOrElseEntry.Kind != tgtok::Else) { 764 PrintFatalError("Invalid preprocessor control on the stack"); 765 return tgtok::Error; 766 } 767 768 if (!prepSkipDirectiveEnd()) 769 return ReturnError(CurPtr, "Only comments are supported after #endif"); 770 771 PrepIncludeStack.back()->pop_back(); 772 773 // If we were processing tokens before this #endif, then 774 // we should continue it. 775 if (ReturnNextLiveToken) { 776 return LexToken(); 777 } 778 779 // Return to the lines skipping code. 780 return Kind; 781 } else if (Kind == tgtok::Define) { 782 StringRef MacroName = prepLexMacroName(); 783 if (MacroName.empty()) 784 return ReturnError(TokStart, "Expected macro name after #define"); 785 786 if (!DefinedMacros.insert(MacroName).second) 787 PrintWarning(getLoc(), 788 "Duplicate definition of macro: " + Twine(MacroName)); 789 790 if (!prepSkipDirectiveEnd()) 791 return ReturnError(CurPtr, 792 "Only comments are supported after #define NAME"); 793 794 if (!ReturnNextLiveToken) { 795 PrintFatalError("#define must be ignored during the lines skipping"); 796 return tgtok::Error; 797 } 798 799 return LexToken(); 800 } 801 802 PrintFatalError("Preprocessing directive is not supported"); 803 return tgtok::Error; 804 } 805 806 bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { 807 if (!MustNeverBeFalse) 808 PrintFatalError("Invalid recursion."); 809 810 do { 811 // Skip all symbols to the line end. 812 prepSkipToLineEnd(); 813 814 // Find the first non-whitespace symbol in the next line(s). 815 if (!prepSkipLineBegin()) 816 return false; 817 818 // If the first non-blank/comment symbol on the line is '#', 819 // it may be a start of preprocessing directive. 820 // 821 // If it is not '#' just go to the next line. 822 if (*CurPtr == '#') 823 ++CurPtr; 824 else 825 continue; 826 827 tgtok::TokKind Kind = prepIsDirective(); 828 829 // If we did not find a preprocessing directive or it is #define, 830 // then just skip to the next line. We do not have to do anything 831 // for #define in the line-skipping mode. 832 if (Kind == tgtok::Error || Kind == tgtok::Define) 833 continue; 834 835 tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false); 836 837 // If lexPreprocessor() encountered an error during lexing this 838 // preprocessor idiom, then return false to the calling lexPreprocessor(). 839 // This will force tgtok::Error to be returned to the tokens processing. 840 if (ProcessedKind == tgtok::Error) 841 return false; 842 843 if (Kind != ProcessedKind) 844 PrintFatalError("prepIsDirective() and lexPreprocessor() " 845 "returned different token kinds"); 846 847 // If this preprocessing directive enables tokens processing, 848 // then return to the lexPreprocessor() and get to the next token. 849 // We can move from line-skipping mode to processing tokens only 850 // due to #else or #endif. 851 if (prepIsProcessingEnabled()) { 852 if (Kind != tgtok::Else && Kind != tgtok::Endif) { 853 PrintFatalError("Tokens processing was enabled by an unexpected " 854 "preprocessing directive"); 855 return false; 856 } 857 858 return true; 859 } 860 } while (CurPtr != CurBuf.end()); 861 862 // We have reached the end of the file, but never left the lines-skipping 863 // mode. This means there is no matching #endif. 864 prepReportPreprocessorStackError(); 865 return false; 866 } 867 868 StringRef TGLexer::prepLexMacroName() { 869 // Skip whitespaces between the preprocessing directive and the macro name. 870 while (*CurPtr == ' ' || *CurPtr == '\t') 871 ++CurPtr; 872 873 TokStart = CurPtr; 874 // Macro names start with [a-zA-Z_]. 875 if (*CurPtr != '_' && !isalpha(*CurPtr)) 876 return ""; 877 878 // Match the rest of the identifier regex: [0-9a-zA-Z_]* 879 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 880 ++CurPtr; 881 882 return StringRef(TokStart, CurPtr - TokStart); 883 } 884 885 bool TGLexer::prepSkipLineBegin() { 886 while (CurPtr != CurBuf.end()) { 887 switch (*CurPtr) { 888 case ' ': 889 case '\t': 890 case '\n': 891 case '\r': 892 break; 893 894 case '/': { 895 int NextChar = peekNextChar(1); 896 if (NextChar == '*') { 897 // Skip C-style comment. 898 // Note that we do not care about skipping the C++-style comments. 899 // If the line contains "//", it may not contain any processable 900 // preprocessing directive. Just return CurPtr pointing to 901 // the first '/' in this case. We also do not care about 902 // incorrect symbols after the first '/' - we are in lines-skipping 903 // mode, so incorrect code is allowed to some extent. 904 905 // Set TokStart to the beginning of the comment to enable proper 906 // diagnostic printing in case of error in SkipCComment(). 907 TokStart = CurPtr; 908 909 // CurPtr must point to '*' before call to SkipCComment(). 910 ++CurPtr; 911 if (SkipCComment()) 912 return false; 913 } else { 914 // CurPtr points to the non-whitespace '/'. 915 return true; 916 } 917 918 // We must not increment CurPtr after the comment was lexed. 919 continue; 920 } 921 922 default: 923 return true; 924 } 925 926 ++CurPtr; 927 } 928 929 // We have reached the end of the file. Return to the lines skipping 930 // code, and allow it to handle the EOF as needed. 931 return true; 932 } 933 934 bool TGLexer::prepSkipDirectiveEnd() { 935 while (CurPtr != CurBuf.end()) { 936 switch (*CurPtr) { 937 case ' ': 938 case '\t': 939 break; 940 941 case '\n': 942 case '\r': 943 return true; 944 945 case '/': { 946 int NextChar = peekNextChar(1); 947 if (NextChar == '/') { 948 // Skip C++-style comment. 949 // We may just return true now, but let's skip to the line/buffer end 950 // to simplify the method specification. 951 ++CurPtr; 952 SkipBCPLComment(); 953 } else if (NextChar == '*') { 954 // When we are skipping C-style comment at the end of a preprocessing 955 // directive, we can skip several lines. If any meaningful TD token 956 // follows the end of the C-style comment on the same line, it will 957 // be considered as an invalid usage of TD token. 958 // For example, we want to forbid usages like this one: 959 // #define MACRO class Class {} 960 // But with C-style comments we also disallow the following: 961 // #define MACRO /* This macro is used 962 // to ... */ class Class {} 963 // One can argue that this should be allowed, but it does not seem 964 // to be worth of the complication. Moreover, this matches 965 // the C preprocessor behavior. 966 967 // Set TokStart to the beginning of the comment to enable proper 968 // diagnostic printer in case of error in SkipCComment(). 969 TokStart = CurPtr; 970 ++CurPtr; 971 if (SkipCComment()) 972 return false; 973 } else { 974 TokStart = CurPtr; 975 PrintError(CurPtr, "Unexpected character"); 976 return false; 977 } 978 979 // We must not increment CurPtr after the comment was lexed. 980 continue; 981 } 982 983 default: 984 // Do not allow any non-whitespaces after the directive. 985 TokStart = CurPtr; 986 return false; 987 } 988 989 ++CurPtr; 990 } 991 992 return true; 993 } 994 995 void TGLexer::prepSkipToLineEnd() { 996 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) 997 ++CurPtr; 998 } 999 1000 bool TGLexer::prepIsProcessingEnabled() { 1001 for (auto I = PrepIncludeStack.back()->rbegin(), 1002 E = PrepIncludeStack.back()->rend(); 1003 I != E; ++I) { 1004 if (!I->IsDefined) 1005 return false; 1006 } 1007 1008 return true; 1009 } 1010 1011 void TGLexer::prepReportPreprocessorStackError() { 1012 if (PrepIncludeStack.back()->empty()) 1013 PrintFatalError("prepReportPreprocessorStackError() called with " 1014 "empty control stack"); 1015 1016 auto &PrepControl = PrepIncludeStack.back()->back(); 1017 PrintError(CurBuf.end(), "Reached EOF without matching #endif"); 1018 PrintError(PrepControl.SrcPos, "The latest preprocessor control is here"); 1019 1020 TokStart = CurPtr; 1021 } 1022