//===- TGLexer.cpp - Lexer for TableGen ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implement the Lexer for TableGen.
//
//===----------------------------------------------------------------------===//

#include "TGLexer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h" // for strtoull()/strtoll() define
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/TableGen/Error.h"
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

using namespace llvm;

namespace {
// A list of supported preprocessing directives with their
// internal token kinds and names.
struct {
  tgtok::TokKind Kind;
  const char *Word;
} PreprocessorDirs[] = {
  { tgtok::Ifdef, "ifdef" },
  { tgtok::Ifndef, "ifndef" },
  { tgtok::Else, "else" },
  { tgtok::Endif, "endif" },
  { tgtok::Define, "define" }
};
} // end anonymous namespace

TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
  CurBuffer = SrcMgr.getMainFileID();
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();
  TokStart = nullptr;

  // Pretend that we enter the "top-level" include file.
  PrepIncludeStack.push_back(
      std::make_unique<std::vector<PreprocessorControlDesc>>());

  // Put all macros defined in the command line into the DefinedMacros set.
  for (const std::string &MacroName : Macros)
    DefinedMacros.insert(MacroName);
}

SMLoc TGLexer::getLoc() const {
  return SMLoc::getFromPointer(TokStart);
}

/// ReturnError - Set the error to the specified string at the specified
/// location. This is defined to always return tgtok::Error.
tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
  PrintError(Loc, Msg);
  return tgtok::Error;
}

tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  return ReturnError(SMLoc::getFromPointer(Loc), Msg);
}

bool TGLexer::processEOF() {
  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  if (ParentIncludeLoc != SMLoc()) {
    // If prepExitInclude() detects a problem with the preprocessing
    // control stack, it will return false. Pretend that we reached
    // the final EOF and stop lexing more tokens by returning false
    // to LexToken().
    if (!prepExitInclude(false))
      return false;

    CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
    CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
    CurPtr = ParentIncludeLoc.getPointer();
    // Make sure TokStart points into the parent file's buffer.
    // LexToken() assigns to it before calling getNextChar(),
    // so it is pointing into the included file now.
    TokStart = CurPtr;
    return true;
  }

  // Pretend that we exit the "top-level" include file.
  // Note that in case of an error (e.g. control stack imbalance)
  // the routine will issue a fatal error.
  prepExitInclude(true);
  return false;
}

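/// getNextChar - Consume and return the next character from the buffer.
/// Returns EOF at the end of the buffer, normalizes any newline sequence
/// ('\n', '\r', "\r\n", "\n\r") to a single '\n', and reports embedded NUL
/// characters as errors while treating them as spaces.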
int TGLexer::getNextChar() {
  char CurChar = *CurPtr++;
  switch (CurChar) {
  default:
    return (unsigned char)CurChar;

  case 0: {
    // A NUL character in the stream is either the end of the current buffer or
    // a spurious NUL in the file. Disambiguate that here.
    if (CurPtr - 1 == CurBuf.end()) {
      --CurPtr; // Arrange for another call to return EOF again.
      return EOF;
    }
    PrintError(getLoc(),
               "NUL character is invalid in source; treated as space");
    return ' ';
  }

  case '\n':
  case '\r':
    // Handle the newline character by ignoring it and incrementing the line
    // count. However, be careful about 'dos style' files with \n\r in them.
    // Only treat a \n\r or \r\n as a single line.
    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
        *CurPtr != CurChar)
      ++CurPtr; // Eat the two char newline sequence.
    return '\n';
  }
}

int TGLexer::peekNextChar(int Index) const {
  return *(CurPtr + Index);
}

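/// LexToken - Read and return the next token from the current buffer,
/// updating TokStart to the beginning of the token. FileOrLineStart is true
/// when lexing starts at the beginning of a file or line; that is the only
/// position where '#' can introduce a preprocessing directive, otherwise '#'
/// is lexed as the paste operator.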
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  switch (CurChar) {
  default:
    // Handle letters: [a-zA-Z_]
    if (isalpha(CurChar) || CurChar == '_')
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "Unexpected character");
  case EOF:
    // Lex next token, if we just left an include file.
    // Note that leaving an include file means that the next
    // symbol is located at the end of the 'include "..."'
    // construct, so LexToken() is called with default
    // false parameter.
    if (processEOF())
      return LexToken();

    // Return EOF denoting the end of lexing.
    return tgtok::Eof;

  case ':': return tgtok::colon;
  case ';': return tgtok::semi;
  case ',': return tgtok::comma;
  case '<': return tgtok::less;
  case '>': return tgtok::greater;
  case ']': return tgtok::r_square;
  case '{': return tgtok::l_brace;
  case '}': return tgtok::r_brace;
  case '(': return tgtok::l_paren;
  case ')': return tgtok::r_paren;
  case '=': return tgtok::equal;
  case '?': return tgtok::question;
  case '#':
    if (FileOrLineStart) {
      tgtok::TokKind Kind = prepIsDirective();
      if (Kind != tgtok::Error)
        return lexPreprocessor(Kind);
    }

    return tgtok::paste;

  // The period is a separate case so we can recognize the "..."
  // range punctuator.
  case '.':
    if (peekNextChar(0) == '.') {
      ++CurPtr; // Eat second dot.
      if (peekNextChar(0) == '.') {
        ++CurPtr; // Eat third dot.
        return tgtok::dotdotdot;
      }
      return ReturnError(TokStart, "Invalid '..' punctuation");
    }
    return tgtok::dot;

  case '\r':
    PrintFatalError("getNextChar() must never return '\r'");
    return tgtok::Error;

  case ' ':
  case '\t':
    // Ignore whitespace.
    return LexToken(FileOrLineStart);
  case '\n':
    // Ignore whitespace, and identify the new line.
    return LexToken(true);
  case '/':
    // If this is the start of a // comment, skip until the end of the line or
    // the end of the buffer.
    if (*CurPtr == '/')
      SkipBCPLComment();
    else if (*CurPtr == '*') {
      if (SkipCComment())
        return tgtok::Error;
    } else // Otherwise, this is an error.
      return ReturnError(TokStart, "Unexpected character");
    return LexToken(FileOrLineStart);
  case '-': case '+':
  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
  case '7': case '8': case '9': {
    int NextChar = 0;
    if (isdigit(CurChar)) {
      // Allow identifiers to start with a number if it is followed by
      // an identifier. This can happen with paste operations like
      // foo#8i.
      int i = 0;
      do {
        NextChar = peekNextChar(i++);
      } while (isdigit(NextChar));

      if (NextChar == 'x' || NextChar == 'b') {
        // If this is [0-9]b[01] or [0-9]x[0-9a-fA-F] this is most
        // likely a number.
        int NextNextChar = peekNextChar(i);
        switch (NextNextChar) {
        default:
          break;
        case '0': case '1':
          if (NextChar == 'b')
            return LexNumber();
          LLVM_FALLTHROUGH;
        case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          if (NextChar == 'x')
            return LexNumber();
          break;
        }
      }
    }

    if (isalpha(NextChar) || NextChar == '_')
      return LexIdentifier();

    return LexNumber();
  }
  case '"': return LexString();
  case '$': return LexVarName();
  case '[': return LexBracket();
  case '!': return LexExclaim();
  }
}

/// LexString - Lex "[^"]*"
tgtok::TokKind TGLexer::LexString() {
  const char *StrStart = CurPtr;

  CurStrVal = "";

  while (*CurPtr != '"') {
    // If we hit the end of the buffer, report an error.
    if (*CurPtr == 0 && CurPtr == CurBuf.end())
      return ReturnError(StrStart, "End of file in string literal");

    if (*CurPtr == '\n' || *CurPtr == '\r')
      return ReturnError(StrStart, "End of line in string literal");

    if (*CurPtr != '\\') {
      CurStrVal += *CurPtr++;
      continue;
    }

    ++CurPtr;

    switch (*CurPtr) {
    case '\\': case '\'': case '"':
      // These turn into their literal character.
      CurStrVal += *CurPtr++;
      break;
    case 't':
      CurStrVal += '\t';
      ++CurPtr;
      break;
    case 'n':
      CurStrVal += '\n';
      ++CurPtr;
      break;

    case '\n':
    case '\r':
      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");

    // If we hit the end of the buffer, report an error.
    case '\0':
      if (CurPtr == CurBuf.end())
        return ReturnError(StrStart, "End of file in string literal");
      LLVM_FALLTHROUGH;
    default:
      return ReturnError(CurPtr, "invalid escape in string literal");
    }
  }

  ++CurPtr;
  return tgtok::StrVal;
}

tgtok::TokKind TGLexer::LexVarName() {
  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    return ReturnError(TokStart, "Invalid variable name");

  // Otherwise, we're ok, consume the rest of the characters.
  const char *VarNameStart = CurPtr++;

  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  CurStrVal.assign(VarNameStart, CurPtr);
  return tgtok::VarName;
}

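/// LexIdentifier - Lex an identifier: [a-zA-Z_][0-9a-zA-Z_]*. Keywords such
/// as 'class', 'def' and 'let' are returned with their dedicated token kinds;
/// the 'include' keyword is handled here by entering the included file and
/// returning its first token.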
tgtok::TokKind TGLexer::LexIdentifier() {
  // The first letter is [a-zA-Z_].
  const char *IdentStart = TokStart;

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  // Check to see if this identifier is a reserved keyword.
  StringRef Str(IdentStart, CurPtr-IdentStart);

  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
    .Case("int", tgtok::Int)
    .Case("bit", tgtok::Bit)
    .Case("bits", tgtok::Bits)
    .Case("string", tgtok::String)
    .Case("list", tgtok::List)
    .Case("code", tgtok::Code)
    .Case("dag", tgtok::Dag)
    .Case("class", tgtok::Class)
    .Case("def", tgtok::Def)
    .Case("true", tgtok::TrueVal)
    .Case("false", tgtok::FalseVal)
    .Case("foreach", tgtok::Foreach)
    .Case("defm", tgtok::Defm)
    .Case("defset", tgtok::Defset)
    .Case("multiclass", tgtok::MultiClass)
    .Case("field", tgtok::Field)
    .Case("let", tgtok::Let)
    .Case("in", tgtok::In)
    .Case("defvar", tgtok::Defvar)
    .Case("include", tgtok::Include)
    .Case("if", tgtok::If)
    .Case("then", tgtok::Then)
    .Case("else", tgtok::ElseKW)
    .Case("assert", tgtok::Assert)
    .Default(tgtok::Id);

  // A couple of tokens require special processing.
  switch (Kind) {
  case tgtok::Include:
    if (LexInclude()) return tgtok::Error;
    return Lex();
  case tgtok::Id:
    CurStrVal.assign(Str.begin(), Str.end());
    break;
  default:
    break;
  }

  return Kind;
}

/// LexInclude - We just read the "include" token. Get the string token that
/// comes next and enter the include.
bool TGLexer::LexInclude() {
  // The token after the include must be a string.
  tgtok::TokKind Tok = LexToken();
  if (Tok == tgtok::Error) return true;
  if (Tok != tgtok::StrVal) {
    PrintError(getLoc(), "Expected filename after include");
    return true;
  }

  // Get the string.
  std::string Filename = CurStrVal;
  std::string IncludedFile;

  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
                                    IncludedFile);
  if (!CurBuffer) {
    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    return true;
  }

  Dependencies.insert(IncludedFile);
  // Save the line number and lex buffer of the includer.
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();

  PrepIncludeStack.push_back(
      std::make_unique<std::vector<PreprocessorControlDesc>>());
  return false;
}

/// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
/// Or we may end up at the end of the buffer.
void TGLexer::SkipBCPLComment() {
  ++CurPtr; // skip the second slash.
  auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
  CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
}

/// SkipCComment - This skips C-style /**/ comments. The only difference from C
/// is that we allow nesting.
bool TGLexer::SkipCComment() {
  ++CurPtr; // skip the star.
  unsigned CommentDepth = 1;

  while (true) {
    int CurChar = getNextChar();
    switch (CurChar) {
    case EOF:
      PrintError(TokStart, "Unterminated comment!");
      return true;
    case '*':
      // End of the comment?
      if (CurPtr[0] != '/') break;

      ++CurPtr; // End the */.
      if (--CommentDepth == 0)
        return false;
      break;
    case '/':
      // Start of a nested comment?
      if (CurPtr[0] != '*') break;
      ++CurPtr;
      ++CommentDepth;
      break;
    }
  }
}

/// LexNumber - Lex:
///    [-+]?[0-9]+
///    0x[0-9a-fA-F]+
///    0b[01]+
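///
/// A "0b..." literal is returned as tgtok::BinaryIntVal so its explicit bit
/// width is preserved; decimal and hexadecimal literals are returned as
/// tgtok::IntVal. A '-' or '+' that is not followed by a digit is returned as
/// tgtok::minus or tgtok::plus respectively.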
tgtok::TokKind TGLexer::LexNumber() {
  if (CurPtr[-1] == '0') {
    if (CurPtr[0] == 'x') {
      ++CurPtr;
      const char *NumStart = CurPtr;
      while (isxdigit(CurPtr[0]))
        ++CurPtr;

      // Requires at least one hex digit.
      if (CurPtr == NumStart)
        return ReturnError(TokStart, "Invalid hexadecimal number");

      errno = 0;
      CurIntVal = strtoll(NumStart, nullptr, 16);
      if (errno == EINVAL)
        return ReturnError(TokStart, "Invalid hexadecimal number");
      if (errno == ERANGE) {
        errno = 0;
        CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16);
        if (errno == EINVAL)
          return ReturnError(TokStart, "Invalid hexadecimal number");
        if (errno == ERANGE)
          return ReturnError(TokStart, "Hexadecimal number out of range");
      }
      return tgtok::IntVal;
    } else if (CurPtr[0] == 'b') {
      ++CurPtr;
      const char *NumStart = CurPtr;
      while (CurPtr[0] == '0' || CurPtr[0] == '1')
        ++CurPtr;

      // Requires at least one binary digit.
      if (CurPtr == NumStart)
        return ReturnError(CurPtr-2, "Invalid binary number");
      CurIntVal = strtoll(NumStart, nullptr, 2);
      return tgtok::BinaryIntVal;
    }
  }

  // Check for a sign without a digit.
  if (!isdigit(CurPtr[0])) {
    if (CurPtr[-1] == '-')
      return tgtok::minus;
    else if (CurPtr[-1] == '+')
      return tgtok::plus;
  }

  while (isdigit(CurPtr[0]))
    ++CurPtr;
  CurIntVal = strtoll(TokStart, nullptr, 10);
  return tgtok::IntVal;
}

/// LexBracket - We just read '['. If this is a code block, return it,
/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
tgtok::TokKind TGLexer::LexBracket() {
  if (CurPtr[0] != '{')
    return tgtok::l_square;
  ++CurPtr;
  const char *CodeStart = CurPtr;
  while (true) {
    int Char = getNextChar();
    if (Char == EOF) break;

    if (Char != '}') continue;

    Char = getNextChar();
    if (Char == EOF) break;
    if (Char == ']') {
      CurStrVal.assign(CodeStart, CurPtr-2);
      return tgtok::CodeFragment;
    }
  }

  return ReturnError(CodeStart - 2, "Unterminated code block");
}

/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
tgtok::TokKind TGLexer::LexExclaim() {
  if (!isalpha(*CurPtr))
    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");

  const char *Start = CurPtr++;
  while (isalpha(*CurPtr))
    ++CurPtr;

  // Check to see which operator this is.
  tgtok::TokKind Kind =
      StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
          .Case("eq", tgtok::XEq)
          .Case("ne", tgtok::XNe)
          .Case("le", tgtok::XLe)
          .Case("lt", tgtok::XLt)
          .Case("ge", tgtok::XGe)
          .Case("gt", tgtok::XGt)
          .Case("if", tgtok::XIf)
          .Case("cond", tgtok::XCond)
          .Case("isa", tgtok::XIsA)
          .Case("head", tgtok::XHead)
          .Case("tail", tgtok::XTail)
          .Case("size", tgtok::XSize)
          .Case("con", tgtok::XConcat)
          .Case("dag", tgtok::XDag)
          .Case("add", tgtok::XADD)
          .Case("sub", tgtok::XSUB)
          .Case("mul", tgtok::XMUL)
          .Case("not", tgtok::XNOT)
          .Case("and", tgtok::XAND)
          .Case("or", tgtok::XOR)
          .Case("xor", tgtok::XXOR)
          .Case("shl", tgtok::XSHL)
          .Case("sra", tgtok::XSRA)
          .Case("srl", tgtok::XSRL)
          .Case("cast", tgtok::XCast)
          .Case("empty", tgtok::XEmpty)
          .Case("subst", tgtok::XSubst)
          .Case("foldl", tgtok::XFoldl)
          .Case("foreach", tgtok::XForEach)
          .Case("filter", tgtok::XFilter)
          .Case("listconcat", tgtok::XListConcat)
          .Case("listsplat", tgtok::XListSplat)
          .Case("strconcat", tgtok::XStrConcat)
          .Case("interleave", tgtok::XInterleave)
          .Case("substr", tgtok::XSubstr)
          .Case("find", tgtok::XFind)
          .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
          .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
          .Case("exists", tgtok::XExists)
          .Default(tgtok::Error);

  return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
}

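/// prepExitInclude - Update the preprocessor control stack when the lexer is
/// about to leave the current file (the "top-level" file if
/// IncludeStackMustBeEmpty is true). Reports an error if the file being left
/// has unterminated #ifdef/#else regions. Returns true on success.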
bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
  // Report an error if the preprocessor control stack for the current
  // file is not empty.
  if (!PrepIncludeStack.back()->empty()) {
    prepReportPreprocessorStackError();

    return false;
  }

  // Pop the preprocessing controls from the include stack.
  if (PrepIncludeStack.empty()) {
    PrintFatalError("Preprocessor include stack is empty");
  }

  PrepIncludeStack.pop_back();

  if (IncludeStackMustBeEmpty) {
    if (!PrepIncludeStack.empty())
      PrintFatalError("Preprocessor include stack is not empty");
  } else {
    if (PrepIncludeStack.empty())
      PrintFatalError("Preprocessor include stack is empty");
  }

  return true;
}

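/// prepIsDirective - Check whether the text at CurPtr is one of the supported
/// preprocessing directives followed by whitespace, end of line/file, or a
/// comment. Returns the directive's token kind without consuming any
/// characters, or tgtok::Error if it is not a directive.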
tgtok::TokKind TGLexer::prepIsDirective() const {
  for (const auto &PD : PreprocessorDirs) {
    int NextChar = *CurPtr;
    bool Match = true;
    unsigned I = 0;
    for (; I < strlen(PD.Word); ++I) {
      if (NextChar != PD.Word[I]) {
        Match = false;
        break;
      }

      NextChar = peekNextChar(I + 1);
    }

    // Check for whitespace after the directive. If there is no whitespace,
    // then we do not recognize it as a preprocessing directive.
    if (Match) {
      tgtok::TokKind Kind = PD.Kind;

      // New line and EOF may follow only #else/#endif. It will be reported
      // as an error for #ifdef/#define after the call to prepLexMacroName().
      if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
          NextChar == '\n' ||
          // It looks like TableGen does not support '\r' as the actual
          // carriage return, e.g. getNextChar() treats a single '\r'
          // as '\n'. So we do the same here.
          NextChar == '\r')
        return Kind;

      // Allow comments after some directives, e.g.:
      //     #else// OR #else/**/
      //     #endif// OR #endif/**/
      //
      // Note that we do allow comments after #ifdef/#define here, e.g.
      //     #ifdef/**/ AND #ifdef//
      //     #define/**/ AND #define//
      //
      // These cases will be reported as incorrect after calling
      // prepLexMacroName(). We could have supported C-style comments
      // after #ifdef/#define, but this would complicate the code
      // for little benefit.
      if (NextChar == '/') {
        NextChar = peekNextChar(I + 1);

        if (NextChar == '*' || NextChar == '/')
          return Kind;

        // Pretend that we do not recognize the directive.
      }
    }
  }

  return tgtok::Error;
}

bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
  TokStart = CurPtr;

  for (const auto &PD : PreprocessorDirs)
    if (PD.Kind == Kind) {
      // Advance CurPtr to the end of the preprocessing word.
      CurPtr += strlen(PD.Word);
      return true;
    }

  PrintFatalError("Unsupported preprocessing token in "
                  "prepEatPreprocessorDirective()");
  return false;
}

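/// lexPreprocessor - Lex a preprocessing directive of the given Kind, which
/// prepIsDirective() has already identified but not consumed. If
/// ReturnNextLiveToken is true (normal lexing), the result is the next live
/// token after the directive; if it is false (called from the line-skipping
/// code in prepSkipRegion()), the directive's own kind is returned.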
tgtok::TokKind TGLexer::lexPreprocessor(
    tgtok::TokKind Kind, bool ReturnNextLiveToken) {

  // We must be looking at a preprocessing directive. Eat it!
  if (!prepEatPreprocessorDirective(Kind))
    PrintFatalError("lexPreprocessor() called for unknown "
                    "preprocessor directive");

  if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
    StringRef MacroName = prepLexMacroName();
    StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
    if (MacroName.empty())
      return ReturnError(TokStart, "Expected macro name after " + IfTokName);

    bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;

    // Canonicalize ifndef to ifdef equivalent
    if (Kind == tgtok::Ifndef) {
      MacroIsDefined = !MacroIsDefined;
      Kind = tgtok::Ifdef;
    }

    // Regardless of whether we are processing tokens or not,
    // we put the #ifdef control on stack.
    PrepIncludeStack.back()->push_back(
        {Kind, MacroIsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after " +
                                     IfTokName + " NAME");

    // If we were not processing tokens before this #ifdef,
    // then just return back to the lines skipping code.
    if (!ReturnNextLiveToken)
      return Kind;

    // If we were processing tokens before this #ifdef,
    // and the macro is defined, then just return the next token.
    if (MacroIsDefined)
      return LexToken();

    // We were processing tokens before this #ifdef, and the macro
    // is not defined, so we have to start skipping the lines.
    // If the skipping is successful, it will return the token following
    // either #else or #endif corresponding to this #ifdef.
    if (prepSkipRegion(ReturnNextLiveToken))
      return LexToken();

    return tgtok::Error;
  } else if (Kind == tgtok::Else) {
    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #else.
    if (PrepIncludeStack.back()->empty())
      return ReturnError(TokStart, "#else without #ifdef or #ifndef");

    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back();

    if (IfdefEntry.Kind != tgtok::Ifdef) {
      PrintError(TokStart, "double #else");
      return ReturnError(IfdefEntry.SrcPos, "Previous #else is here");
    }

    // Replace the corresponding #ifdef's control with its negation
    // on the control stack.
    PrepIncludeStack.back()->pop_back();
    PrepIncludeStack.back()->push_back(
        {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after #else");

    // If we were processing tokens before this #else,
    // we have to start skipping lines until the matching #endif.
    if (ReturnNextLiveToken) {
      if (prepSkipRegion(ReturnNextLiveToken))
        return LexToken();

      return tgtok::Error;
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Endif) {
    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #endif.
    if (PrepIncludeStack.back()->empty())
      return ReturnError(TokStart, "#endif without #ifdef");

    auto &IfdefOrElseEntry = PrepIncludeStack.back()->back();

    if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
        IfdefOrElseEntry.Kind != tgtok::Else) {
      PrintFatalError("Invalid preprocessor control on the stack");
      return tgtok::Error;
    }

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after #endif");

    PrepIncludeStack.back()->pop_back();

    // If we were processing tokens before this #endif, then
    // we should continue it.
    if (ReturnNextLiveToken) {
      return LexToken();
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Define) {
    StringRef MacroName = prepLexMacroName();
    if (MacroName.empty())
      return ReturnError(TokStart, "Expected macro name after #define");

    if (!DefinedMacros.insert(MacroName).second)
      PrintWarning(getLoc(),
                   "Duplicate definition of macro: " + Twine(MacroName));

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr,
                         "Only comments are supported after #define NAME");

    if (!ReturnNextLiveToken) {
      PrintFatalError("#define must be ignored during the lines skipping");
      return tgtok::Error;
    }

    return LexToken();
  }

  PrintFatalError("Preprocessing directive is not supported");
  return tgtok::Error;
}

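/// prepSkipRegion - Skip lines that are disabled by an unsatisfied #ifdef or
/// by #else, until a directive re-enables token processing. Returns true once
/// the matching #else or #endif has been consumed and lexing may resume, and
/// false on error or if EOF is reached without a matching #endif.
/// MustNeverBeFalse mirrors the caller's ReturnNextLiveToken argument and
/// must always be true.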
bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
  if (!MustNeverBeFalse)
    PrintFatalError("Invalid recursion.");

  do {
    // Skip all symbols to the line end.
    prepSkipToLineEnd();

    // Find the first non-whitespace symbol in the next line(s).
    if (!prepSkipLineBegin())
      return false;

    // If the first non-blank/comment symbol on the line is '#',
    // it may be a start of preprocessing directive.
    //
    // If it is not '#' just go to the next line.
    if (*CurPtr == '#')
      ++CurPtr;
    else
      continue;

    tgtok::TokKind Kind = prepIsDirective();

    // If we did not find a preprocessing directive or it is #define,
    // then just skip to the next line. We do not have to do anything
    // for #define in the line-skipping mode.
    if (Kind == tgtok::Error || Kind == tgtok::Define)
      continue;

    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);

    // If lexPreprocessor() encountered an error during lexing this
    // preprocessor idiom, then return false to the calling lexPreprocessor().
    // This will force tgtok::Error to be returned to the tokens processing.
    if (ProcessedKind == tgtok::Error)
      return false;

    if (Kind != ProcessedKind)
      PrintFatalError("prepIsDirective() and lexPreprocessor() "
                      "returned different token kinds");

    // If this preprocessing directive enables tokens processing,
    // then return to the lexPreprocessor() and get to the next token.
    // We can move from line-skipping mode to processing tokens only
    // due to #else or #endif.
    if (prepIsProcessingEnabled()) {
      if (Kind != tgtok::Else && Kind != tgtok::Endif) {
        PrintFatalError("Tokens processing was enabled by an unexpected "
                        "preprocessing directive");
        return false;
      }

      return true;
    }
  } while (CurPtr != CurBuf.end());

  // We have reached the end of the file, but never left the lines-skipping
  // mode. This means there is no matching #endif.
  prepReportPreprocessorStackError();
  return false;
}

StringRef TGLexer::prepLexMacroName() {
  // Skip whitespaces between the preprocessing directive and the macro name.
  while (*CurPtr == ' ' || *CurPtr == '\t')
    ++CurPtr;

  TokStart = CurPtr;
  // Macro names start with [a-zA-Z_].
  if (*CurPtr != '_' && !isalpha(*CurPtr))
    return "";

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  return StringRef(TokStart, CurPtr - TokStart);
}

bool TGLexer::prepSkipLineBegin() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      break;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '*') {
        // Skip C-style comment.
        // Note that we do not care about skipping the C++-style comments.
        // If the line contains "//", it may not contain any processable
        // preprocessing directive. Just return CurPtr pointing to
        // the first '/' in this case. We also do not care about
        // incorrect symbols after the first '/' - we are in lines-skipping
        // mode, so incorrect code is allowed to some extent.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;

        // CurPtr must point to '*' before call to SkipCComment().
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // CurPtr points to the non-whitespace '/'.
        return true;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      return true;
    }

    ++CurPtr;
  }

  // We have reached the end of the file. Return to the lines skipping
  // code, and allow it to handle the EOF as needed.
  return true;
}

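/// prepSkipDirectiveEnd - Skip whitespace and comments that may follow a
/// preprocessing directive up to the end of the line (or EOF). Returns false
/// if anything other than whitespace or a comment is found before the end of
/// the line.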
bool TGLexer::prepSkipDirectiveEnd() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
      break;

    case '\n':
    case '\r':
      return true;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '/') {
        // Skip C++-style comment.
        // We may just return true now, but let's skip to the line/buffer end
        // to simplify the method specification.
        ++CurPtr;
        SkipBCPLComment();
      } else if (NextChar == '*') {
        // When we are skipping C-style comment at the end of a preprocessing
        // directive, we can skip several lines. If any meaningful TD token
        // follows the end of the C-style comment on the same line, it will
        // be considered as an invalid usage of TD token.
        // For example, we want to forbid usages like this one:
        //     #define MACRO class Class {}
        // But with C-style comments we also disallow the following:
        //     #define MACRO /* This macro is used
        //                      to ... */ class Class {}
        // One can argue that this should be allowed, but it does not seem
        // to be worth the complication. Moreover, this matches
        // the C preprocessor behavior.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        TokStart = CurPtr;
        PrintError(CurPtr, "Unexpected character");
        return false;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // Do not allow any non-whitespaces after the directive.
      TokStart = CurPtr;
      return false;
    }

    ++CurPtr;
  }

  return true;
}

void TGLexer::prepSkipToLineEnd() {
  while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end())
    ++CurPtr;
}

bool TGLexer::prepIsProcessingEnabled() {
  for (const PreprocessorControlDesc &I :
       llvm::reverse(*PrepIncludeStack.back()))
    if (!I.IsDefined)
      return false;

  return true;
}

void TGLexer::prepReportPreprocessorStackError() {
  if (PrepIncludeStack.back()->empty())
    PrintFatalError("prepReportPreprocessorStackError() called with "
                    "empty control stack");

  auto &PrepControl = PrepIncludeStack.back()->back();
  PrintError(CurBuf.end(), "Reached EOF without matching #endif");
  PrintError(PrepControl.SrcPos, "The latest preprocessor control is here");

  TokStart = CurPtr;
}