//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implement the Lexer for TableGen.
//
//===----------------------------------------------------------------------===//

#include "TGLexer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h" // for strtoull()/strtoll() define
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/TableGen/Error.h"
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

using namespace llvm;

namespace {
// A list of supported preprocessing directives with their
// internal token kinds and names.
struct {
  tgtok::TokKind Kind;
  const char *Word;
} PreprocessorDirs[] = {
  { tgtok::Ifdef, "ifdef" },
  { tgtok::Ifndef, "ifndef" },
  { tgtok::Else, "else" },
  { tgtok::Endif, "endif" },
  { tgtok::Define, "define" }
};
} // end anonymous namespace

TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
  CurBuffer = SrcMgr.getMainFileID();
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();
  TokStart = nullptr;

  // Pretend that we enter the "top-level" include file.
  PrepIncludeStack.push_back(
      std::make_unique<std::vector<PreprocessorControlDesc>>());

  // Put all macros defined in the command line into the DefinedMacros set.
  std::for_each(Macros.begin(), Macros.end(),
                [this](const std::string &MacroName) {
                  DefinedMacros.insert(MacroName);
                });
}

SMLoc TGLexer::getLoc() const {
  return SMLoc::getFromPointer(TokStart);
}

/// ReturnError - Set the error to the specified string at the specified
/// location. This is defined to always return tgtok::Error.
tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
  PrintError(Loc, Msg);
  return tgtok::Error;
}

tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  return ReturnError(SMLoc::getFromPointer(Loc), Msg);
}

bool TGLexer::processEOF() {
  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  if (ParentIncludeLoc != SMLoc()) {
    // If prepExitInclude() detects a problem with the preprocessing
    // control stack, it will return false. Pretend that we reached
    // the final EOF and stop lexing more tokens by returning false
    // to LexToken().
    if (!prepExitInclude(false))
      return false;

    CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
    CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
    CurPtr = ParentIncludeLoc.getPointer();
    // Make sure TokStart points into the parent file's buffer.
    // LexToken() assigns to it before calling getNextChar(),
    // so it is pointing into the included file now.
    TokStart = CurPtr;
    return true;
  }

  // Pretend that we exit the "top-level" include file.
  // Note that in case of an error (e.g. control stack imbalance)
  // the routine will issue a fatal error.
  prepExitInclude(true);
  return false;
}

int TGLexer::getNextChar() {
  char CurChar = *CurPtr++;
  switch (CurChar) {
  default:
    return (unsigned char)CurChar;

  case 0: {
    // A NUL character in the stream is either the end of the current buffer or
    // a spurious NUL in the file. Disambiguate that here.
    if (CurPtr - 1 == CurBuf.end()) {
      --CurPtr; // Arrange for another call to return EOF again.
      return EOF;
    }
    PrintError(getLoc(),
               "NUL character is invalid in source; treated as space");
    return ' ';
  }

  case '\n':
  case '\r':
    // Handle the newline character by ignoring it and incrementing the line
    // count. However, be careful about 'dos style' files with \n\r in them.
    // Only treat a \n\r or \r\n as a single line.
    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
        *CurPtr != CurChar)
      ++CurPtr; // Eat the two char newline sequence.
    return '\n';
  }
}

int TGLexer::peekNextChar(int Index) const {
  return *(CurPtr + Index);
}

tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  switch (CurChar) {
  default:
    // Handle letters: [a-zA-Z_]
    if (isalpha(CurChar) || CurChar == '_')
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "Unexpected character");
  case EOF:
    // Lex next token, if we just left an include file.
    // Note that leaving an include file means that the next
    // symbol is located at the end of the 'include "..."'
    // construct, so LexToken() is called with default
    // false parameter.
    if (processEOF())
      return LexToken();

    // Return EOF denoting the end of lexing.
    return tgtok::Eof;

  case ':': return tgtok::colon;
  case ';': return tgtok::semi;
  case ',': return tgtok::comma;
  case '<': return tgtok::less;
  case '>': return tgtok::greater;
  case ']': return tgtok::r_square;
  case '{': return tgtok::l_brace;
  case '}': return tgtok::r_brace;
  case '(': return tgtok::l_paren;
  case ')': return tgtok::r_paren;
  case '=': return tgtok::equal;
  case '?': return tgtok::question;
  case '#':
    if (FileOrLineStart) {
      tgtok::TokKind Kind = prepIsDirective();
      if (Kind != tgtok::Error)
        return lexPreprocessor(Kind);
    }

    return tgtok::paste;

  // The period is a separate case so we can recognize the "..."
  // range punctuator.
  case '.':
    if (peekNextChar(0) == '.') {
      ++CurPtr; // Eat second dot.
      if (peekNextChar(0) == '.') {
        ++CurPtr; // Eat third dot.
        return tgtok::dotdotdot;
      }
      return ReturnError(TokStart, "Invalid '..' punctuation");
    }
    return tgtok::dot;

  case '\r':
    PrintFatalError("getNextChar() must never return '\r'");
    return tgtok::Error;

  case ' ':
  case '\t':
    // Ignore whitespace.
    return LexToken(FileOrLineStart);
  case '\n':
    // Ignore whitespace, and identify the new line.
    return LexToken(true);
  case '/':
    // If this is the start of a // comment, skip until the end of the line or
    // the end of the buffer.
    if (*CurPtr == '/')
      SkipBCPLComment();
    else if (*CurPtr == '*') {
      if (SkipCComment())
        return tgtok::Error;
    } else // Otherwise, this is an error.
      return ReturnError(TokStart, "Unexpected character");
    return LexToken(FileOrLineStart);
  case '-': case '+':
  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
  case '7': case '8': case '9': {
    int NextChar = 0;
    if (isdigit(CurChar)) {
      // Allow identifiers to start with a number if it is followed by
      // an identifier. This can happen with paste operations like
      // foo#8i.
      int i = 0;
      do {
        NextChar = peekNextChar(i++);
      } while (isdigit(NextChar));

      if (NextChar == 'x' || NextChar == 'b') {
        // If this is [0-9]b[01] or [0-9]x[0-9a-fA-F] this is most
        // likely a number.
        int NextNextChar = peekNextChar(i);
        switch (NextNextChar) {
        default:
          break;
        case '0': case '1':
          if (NextChar == 'b')
            return LexNumber();
          LLVM_FALLTHROUGH;
        case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          if (NextChar == 'x')
            return LexNumber();
          break;
        }
      }
    }

    if (isalpha(NextChar) || NextChar == '_')
      return LexIdentifier();

    return LexNumber();
  }
  case '"': return LexString();
  case '$': return LexVarName();
  case '[': return LexBracket();
  case '!': return LexExclaim();
  }
}

/// LexString - Lex "[^"]*"
tgtok::TokKind TGLexer::LexString() {
  const char *StrStart = CurPtr;

  CurStrVal = "";

  while (*CurPtr != '"') {
    // If we hit the end of the buffer, report an error.
    if (*CurPtr == 0 && CurPtr == CurBuf.end())
      return ReturnError(StrStart, "End of file in string literal");

    if (*CurPtr == '\n' || *CurPtr == '\r')
      return ReturnError(StrStart, "End of line in string literal");

    if (*CurPtr != '\\') {
      CurStrVal += *CurPtr++;
      continue;
    }

    ++CurPtr;

    switch (*CurPtr) {
    case '\\': case '\'': case '"':
      // These turn into their literal character.
      CurStrVal += *CurPtr++;
      break;
    case 't':
      CurStrVal += '\t';
      ++CurPtr;
      break;
    case 'n':
      CurStrVal += '\n';
      ++CurPtr;
      break;

    case '\n':
    case '\r':
      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");

    // If we hit the end of the buffer, report an error.
    case '\0':
      if (CurPtr == CurBuf.end())
        return ReturnError(StrStart, "End of file in string literal");
      LLVM_FALLTHROUGH;
    default:
      return ReturnError(CurPtr, "invalid escape in string literal");
    }
  }

  ++CurPtr;
  return tgtok::StrVal;
}

tgtok::TokKind TGLexer::LexVarName() {
  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    return ReturnError(TokStart, "Invalid variable name");

  // Otherwise, we're ok, consume the rest of the characters.
  const char *VarNameStart = CurPtr++;

  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  CurStrVal.assign(VarNameStart, CurPtr);
  return tgtok::VarName;
}

tgtok::TokKind TGLexer::LexIdentifier() {
  // The first letter is [a-zA-Z_].
  const char *IdentStart = TokStart;

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  // Check to see if this identifier is a reserved keyword.
  StringRef Str(IdentStart, CurPtr-IdentStart);

  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
    .Case("int", tgtok::Int)
    .Case("bit", tgtok::Bit)
    .Case("bits", tgtok::Bits)
    .Case("string", tgtok::String)
    .Case("list", tgtok::List)
    .Case("code", tgtok::Code)
    .Case("dag", tgtok::Dag)
    .Case("class", tgtok::Class)
    .Case("def", tgtok::Def)
    .Case("true", tgtok::TrueVal)
    .Case("false", tgtok::FalseVal)
    .Case("foreach", tgtok::Foreach)
    .Case("defm", tgtok::Defm)
    .Case("defset", tgtok::Defset)
    .Case("multiclass", tgtok::MultiClass)
    .Case("field", tgtok::Field)
    .Case("let", tgtok::Let)
    .Case("in", tgtok::In)
    .Case("defvar", tgtok::Defvar)
    .Case("include", tgtok::Include)
    .Case("if", tgtok::If)
    .Case("then", tgtok::Then)
    .Case("else", tgtok::ElseKW)
    .Case("assert", tgtok::Assert)
    .Default(tgtok::Id);

  // A couple of tokens require special processing.
  switch (Kind) {
  case tgtok::Include:
    if (LexInclude()) return tgtok::Error;
    return Lex();
  case tgtok::Id:
    CurStrVal.assign(Str.begin(), Str.end());
    break;
  default:
    break;
  }

  return Kind;
}

/// LexInclude - We just read the "include" token. Get the string token that
/// comes next and enter the include.
bool TGLexer::LexInclude() {
  // The token after the include must be a string.
  tgtok::TokKind Tok = LexToken();
  if (Tok == tgtok::Error) return true;
  if (Tok != tgtok::StrVal) {
    PrintError(getLoc(), "Expected filename after include");
    return true;
  }

  // Get the string.
  std::string Filename = CurStrVal;
  std::string IncludedFile;

  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
                                    IncludedFile);
  if (!CurBuffer) {
    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    return true;
  }

  Dependencies.insert(IncludedFile);
  // Save the line number and lex buffer of the includer.
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();

  PrepIncludeStack.push_back(
      std::make_unique<std::vector<PreprocessorControlDesc>>());
  return false;
}

/// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
/// Or we may end up at the end of the buffer.
void TGLexer::SkipBCPLComment() {
  ++CurPtr; // skip the second slash.
  auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
  CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
}

/// SkipCComment - This skips C-style /**/ comments. The only difference from C
/// is that we allow nesting.
bool TGLexer::SkipCComment() {
  ++CurPtr; // skip the star.
  unsigned CommentDepth = 1;

  while (true) {
    int CurChar = getNextChar();
    switch (CurChar) {
    case EOF:
      PrintError(TokStart, "Unterminated comment!");
      return true;
    case '*':
      // End of the comment?
      if (CurPtr[0] != '/') break;

      ++CurPtr; // End the */.
      if (--CommentDepth == 0)
        return false;
      break;
    case '/':
      // Start of a nested comment?
      if (CurPtr[0] != '*') break;
      ++CurPtr;
      ++CommentDepth;
      break;
    }
  }
}

/// LexNumber - Lex:
///    [-+]?[0-9]+
///    0x[0-9a-fA-F]+
///    0b[01]+
tgtok::TokKind TGLexer::LexNumber() {
  if (CurPtr[-1] == '0') {
    if (CurPtr[0] == 'x') {
      ++CurPtr;
      const char *NumStart = CurPtr;
      while (isxdigit(CurPtr[0]))
        ++CurPtr;

      // Requires at least one hex digit.
      if (CurPtr == NumStart)
        return ReturnError(TokStart, "Invalid hexadecimal number");

      errno = 0;
      CurIntVal = strtoll(NumStart, nullptr, 16);
      if (errno == EINVAL)
        return ReturnError(TokStart, "Invalid hexadecimal number");
      if (errno == ERANGE) {
        errno = 0;
        CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16);
        if (errno == EINVAL)
          return ReturnError(TokStart, "Invalid hexadecimal number");
        if (errno == ERANGE)
          return ReturnError(TokStart, "Hexadecimal number out of range");
      }
      return tgtok::IntVal;
    } else if (CurPtr[0] == 'b') {
      ++CurPtr;
      const char *NumStart = CurPtr;
      while (CurPtr[0] == '0' || CurPtr[0] == '1')
        ++CurPtr;

      // Requires at least one binary digit.
      if (CurPtr == NumStart)
        return ReturnError(CurPtr-2, "Invalid binary number");
      CurIntVal = strtoll(NumStart, nullptr, 2);
      return tgtok::BinaryIntVal;
    }
  }

  // Check for a sign without a digit.
  if (!isdigit(CurPtr[0])) {
    if (CurPtr[-1] == '-')
      return tgtok::minus;
    else if (CurPtr[-1] == '+')
      return tgtok::plus;
  }

  while (isdigit(CurPtr[0]))
    ++CurPtr;
  CurIntVal = strtoll(TokStart, nullptr, 10);
  return tgtok::IntVal;
}

/// LexBracket - We just read '['. If this is a code block, return it,
/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
tgtok::TokKind TGLexer::LexBracket() {
  if (CurPtr[0] != '{')
    return tgtok::l_square;
  ++CurPtr;
  const char *CodeStart = CurPtr;
  while (true) {
    int Char = getNextChar();
    if (Char == EOF) break;

    if (Char != '}') continue;

    Char = getNextChar();
    if (Char == EOF) break;
    if (Char == ']') {
      CurStrVal.assign(CodeStart, CurPtr-2);
      return tgtok::CodeFragment;
    }
  }

  return ReturnError(CodeStart - 2, "Unterminated code block");
}

/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
tgtok::TokKind TGLexer::LexExclaim() {
  if (!isalpha(*CurPtr))
    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");

  const char *Start = CurPtr++;
  while (isalpha(*CurPtr))
    ++CurPtr;

  // Check to see which operator this is.
  tgtok::TokKind Kind =
      StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
          .Case("eq", tgtok::XEq)
          .Case("ne", tgtok::XNe)
          .Case("le", tgtok::XLe)
          .Case("lt", tgtok::XLt)
          .Case("ge", tgtok::XGe)
          .Case("gt", tgtok::XGt)
          .Case("if", tgtok::XIf)
          .Case("cond", tgtok::XCond)
          .Case("isa", tgtok::XIsA)
          .Case("head", tgtok::XHead)
          .Case("tail", tgtok::XTail)
          .Case("size", tgtok::XSize)
          .Case("con", tgtok::XConcat)
          .Case("dag", tgtok::XDag)
          .Case("add", tgtok::XADD)
          .Case("sub", tgtok::XSUB)
          .Case("mul", tgtok::XMUL)
          .Case("not", tgtok::XNOT)
          .Case("and", tgtok::XAND)
          .Case("or", tgtok::XOR)
          .Case("xor", tgtok::XXOR)
          .Case("shl", tgtok::XSHL)
          .Case("sra", tgtok::XSRA)
          .Case("srl", tgtok::XSRL)
          .Case("cast", tgtok::XCast)
          .Case("empty", tgtok::XEmpty)
          .Case("subst", tgtok::XSubst)
          .Case("foldl", tgtok::XFoldl)
          .Case("foreach", tgtok::XForEach)
          .Case("filter", tgtok::XFilter)
          .Case("listconcat", tgtok::XListConcat)
          .Case("listsplat", tgtok::XListSplat)
          .Case("strconcat", tgtok::XStrConcat)
          .Case("interleave", tgtok::XInterleave)
          .Case("substr", tgtok::XSubstr)
          .Case("find", tgtok::XFind)
          .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
          .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
          .Default(tgtok::Error);

  return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
}

bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
  // Report an error, if preprocessor control stack for the current
  // file is not empty.
  if (!PrepIncludeStack.back()->empty()) {
    prepReportPreprocessorStackError();

    return false;
  }

  // Pop the preprocessing controls from the include stack.
  if (PrepIncludeStack.empty()) {
    PrintFatalError("Preprocessor include stack is empty");
  }

  PrepIncludeStack.pop_back();

  if (IncludeStackMustBeEmpty) {
    if (!PrepIncludeStack.empty())
      PrintFatalError("Preprocessor include stack is not empty");
  } else {
    if (PrepIncludeStack.empty())
      PrintFatalError("Preprocessor include stack is empty");
  }

  return true;
}

tgtok::TokKind TGLexer::prepIsDirective() const {
  for (const auto &PD : PreprocessorDirs) {
    int NextChar = *CurPtr;
    bool Match = true;
    unsigned I = 0;
    for (; I < strlen(PD.Word); ++I) {
      if (NextChar != PD.Word[I]) {
        Match = false;
        break;
      }

      NextChar = peekNextChar(I + 1);
    }

    // Check for whitespace after the directive. If there is no whitespace,
    // then we do not recognize it as a preprocessing directive.
    if (Match) {
      tgtok::TokKind Kind = PD.Kind;

      // New line and EOF may follow only #else/#endif. It will be reported
      // as an error for #ifdef/#define after the call to prepLexMacroName().
      if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
          NextChar == '\n' ||
          // It looks like TableGen does not support '\r' as the actual
          // carriage return, e.g. getNextChar() treats a single '\r'
          // as '\n'. So we do the same here.
          NextChar == '\r')
        return Kind;

      // Allow comments after some directives, e.g.:
      //     #else// OR #else/**/
      //     #endif// OR #endif/**/
      //
      // Note that we do allow comments after #ifdef/#define here, e.g.
      //     #ifdef/**/ AND #ifdef//
      //     #define/**/ AND #define//
      //
      // These cases will be reported as incorrect after calling
      // prepLexMacroName(). We could have supported C-style comments
      // after #ifdef/#define, but this would complicate the code
      // for little benefit.
      if (NextChar == '/') {
        NextChar = peekNextChar(I + 1);

        if (NextChar == '*' || NextChar == '/')
          return Kind;

        // Pretend that we do not recognize the directive.
      }
    }
  }

  return tgtok::Error;
}

bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
  TokStart = CurPtr;

  for (const auto &PD : PreprocessorDirs)
    if (PD.Kind == Kind) {
      // Advance CurPtr to the end of the preprocessing word.
      CurPtr += strlen(PD.Word);
      return true;
    }

  PrintFatalError("Unsupported preprocessing token in "
                  "prepEatPreprocessorDirective()");
  return false;
}

tgtok::TokKind TGLexer::lexPreprocessor(
    tgtok::TokKind Kind, bool ReturnNextLiveToken) {

  // We must be looking at a preprocessing directive. Eat it!
  if (!prepEatPreprocessorDirective(Kind))
    PrintFatalError("lexPreprocessor() called for unknown "
                    "preprocessor directive");

  if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
    StringRef MacroName = prepLexMacroName();
    StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
    if (MacroName.empty())
      return ReturnError(TokStart, "Expected macro name after " + IfTokName);

    bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;

    // Canonicalize ifndef to ifdef equivalent
    if (Kind == tgtok::Ifndef) {
      MacroIsDefined = !MacroIsDefined;
      Kind = tgtok::Ifdef;
    }

    // Regardless of whether we are processing tokens or not,
    // we put the #ifdef control on stack.
    PrepIncludeStack.back()->push_back(
        {Kind, MacroIsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after " +
                                     IfTokName + " NAME");

    // If we were not processing tokens before this #ifdef,
    // then just return back to the lines skipping code.
    if (!ReturnNextLiveToken)
      return Kind;

    // If we were processing tokens before this #ifdef,
    // and the macro is defined, then just return the next token.
    if (MacroIsDefined)
      return LexToken();

    // We were processing tokens before this #ifdef, and the macro
    // is not defined, so we have to start skipping the lines.
    // If the skipping is successful, it will return the token following
    // either #else or #endif corresponding to this #ifdef.
    if (prepSkipRegion(ReturnNextLiveToken))
      return LexToken();

    return tgtok::Error;
  } else if (Kind == tgtok::Else) {
    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #else.
    if (PrepIncludeStack.back()->empty())
      return ReturnError(TokStart, "#else without #ifdef or #ifndef");

    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back();

    if (IfdefEntry.Kind != tgtok::Ifdef) {
      PrintError(TokStart, "double #else");
      return ReturnError(IfdefEntry.SrcPos, "Previous #else is here");
    }

    // Replace the corresponding #ifdef's control with its negation
    // on the control stack.
    PrepIncludeStack.back()->pop_back();
    PrepIncludeStack.back()->push_back(
        {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after #else");

    // If we were processing tokens before this #else,
    // we have to start skipping lines until the matching #endif.
    if (ReturnNextLiveToken) {
      if (prepSkipRegion(ReturnNextLiveToken))
        return LexToken();

      return tgtok::Error;
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Endif) {
    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #endif.
    if (PrepIncludeStack.back()->empty())
      return ReturnError(TokStart, "#endif without #ifdef");

    auto &IfdefOrElseEntry = PrepIncludeStack.back()->back();

    if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
        IfdefOrElseEntry.Kind != tgtok::Else) {
      PrintFatalError("Invalid preprocessor control on the stack");
      return tgtok::Error;
    }

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after #endif");

    PrepIncludeStack.back()->pop_back();

    // If we were processing tokens before this #endif, then
    // we should continue it.
    if (ReturnNextLiveToken) {
      return LexToken();
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Define) {
    StringRef MacroName = prepLexMacroName();
    if (MacroName.empty())
      return ReturnError(TokStart, "Expected macro name after #define");

    if (!DefinedMacros.insert(MacroName).second)
      PrintWarning(getLoc(),
                   "Duplicate definition of macro: " + Twine(MacroName));

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr,
                         "Only comments are supported after #define NAME");

    if (!ReturnNextLiveToken) {
      PrintFatalError("#define must be ignored during the lines skipping");
      return tgtok::Error;
    }

    return LexToken();
  }

  PrintFatalError("Preprocessing directive is not supported");
  return tgtok::Error;
}

bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
  if (!MustNeverBeFalse)
    PrintFatalError("Invalid recursion.");

  do {
    // Skip all symbols to the line end.
    prepSkipToLineEnd();

    // Find the first non-whitespace symbol in the next line(s).
    if (!prepSkipLineBegin())
      return false;

    // If the first non-blank/comment symbol on the line is '#',
    // it may be a start of preprocessing directive.
    //
    // If it is not '#' just go to the next line.
    if (*CurPtr == '#')
      ++CurPtr;
    else
      continue;

    tgtok::TokKind Kind = prepIsDirective();

    // If we did not find a preprocessing directive or it is #define,
    // then just skip to the next line. We do not have to do anything
    // for #define in the line-skipping mode.
    if (Kind == tgtok::Error || Kind == tgtok::Define)
      continue;

    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);

    // If lexPreprocessor() encountered an error during lexing this
    // preprocessor idiom, then return false to the calling lexPreprocessor().
    // This will force tgtok::Error to be returned to the tokens processing.
    if (ProcessedKind == tgtok::Error)
      return false;

    if (Kind != ProcessedKind)
      PrintFatalError("prepIsDirective() and lexPreprocessor() "
                      "returned different token kinds");

    // If this preprocessing directive enables tokens processing,
    // then return to the lexPreprocessor() and get to the next token.
    // We can move from line-skipping mode to processing tokens only
    // due to #else or #endif.
    if (prepIsProcessingEnabled()) {
      if (Kind != tgtok::Else && Kind != tgtok::Endif) {
        PrintFatalError("Tokens processing was enabled by an unexpected "
                        "preprocessing directive");
        return false;
      }

      return true;
    }
  } while (CurPtr != CurBuf.end());

  // We have reached the end of the file, but never left the lines-skipping
  // mode. This means there is no matching #endif.
  prepReportPreprocessorStackError();
  return false;
}

StringRef TGLexer::prepLexMacroName() {
  // Skip whitespaces between the preprocessing directive and the macro name.
  while (*CurPtr == ' ' || *CurPtr == '\t')
    ++CurPtr;

  TokStart = CurPtr;
  // Macro names start with [a-zA-Z_].
  if (*CurPtr != '_' && !isalpha(*CurPtr))
    return "";

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  return StringRef(TokStart, CurPtr - TokStart);
}

bool TGLexer::prepSkipLineBegin() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      break;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '*') {
        // Skip C-style comment.
        // Note that we do not care about skipping the C++-style comments.
        // If the line contains "//", it may not contain any processable
        // preprocessing directive. Just return CurPtr pointing to
        // the first '/' in this case. We also do not care about
        // incorrect symbols after the first '/' - we are in lines-skipping
        // mode, so incorrect code is allowed to some extent.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;

        // CurPtr must point to '*' before call to SkipCComment().
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // CurPtr points to the non-whitespace '/'.
        return true;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      return true;
    }

    ++CurPtr;
  }

  // We have reached the end of the file. Return to the lines skipping
  // code, and allow it to handle the EOF as needed.
  return true;
}

bool TGLexer::prepSkipDirectiveEnd() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
      break;

    case '\n':
    case '\r':
      return true;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '/') {
        // Skip C++-style comment.
        // We may just return true now, but let's skip to the line/buffer end
        // to simplify the method specification.
        ++CurPtr;
        SkipBCPLComment();
      } else if (NextChar == '*') {
        // When we are skipping C-style comment at the end of a preprocessing
        // directive, we can skip several lines. If any meaningful TD token
        // follows the end of the C-style comment on the same line, it will
        // be treated as an invalid use of a TD token.
        // For example, we want to forbid usages like this one:
        //     #define MACRO class Class {}
        // But with C-style comments we also disallow the following:
        //     #define MACRO /* This macro is used
        //                      to ... */ class Class {}
        // One can argue that this should be allowed, but it does not seem
        // to be worth the complication. Moreover, this matches
        // the C preprocessor behavior.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        TokStart = CurPtr;
        PrintError(CurPtr, "Unexpected character");
        return false;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // Do not allow any non-whitespaces after the directive.
      TokStart = CurPtr;
      return false;
    }

    ++CurPtr;
  }

  return true;
}

void TGLexer::prepSkipToLineEnd() {
  while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end())
    ++CurPtr;
}

bool TGLexer::prepIsProcessingEnabled() {
  for (const PreprocessorControlDesc &I :
       llvm::reverse(*PrepIncludeStack.back()))
    if (!I.IsDefined)
      return false;

  return true;
}

void TGLexer::prepReportPreprocessorStackError() {
  if (PrepIncludeStack.back()->empty())
    PrintFatalError("prepReportPreprocessorStackError() called with "
                    "empty control stack");

  auto &PrepControl = PrepIncludeStack.back()->back();
  PrintError(CurBuf.end(), "Reached EOF without matching #endif");
  PrintError(PrepControl.SrcPos, "The latest preprocessor control is here");

  TokStart = CurPtr;
}
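
// Illustrative sketch of the preprocessing support implemented above
// (prepIsDirective(), lexPreprocessor(), prepSkipRegion(), ...). The macro
// and record names below are hypothetical TableGen input, not upstream code:
//
//   #ifdef ENABLE_FOO        // pushes {Ifdef, MacroIsDefined, loc} onto the
//                            // per-file control stack
//   def Foo : SomeClass;     // lexed only if ENABLE_FOO was defined, e.g.
//                            // via the Macros constructor argument or #define
//   #else                    // replaces the top control with its negation
//   def Bar : SomeClass;     // lexed only if ENABLE_FOO was not defined
//   #endif                   // pops the control from the stack
//
// While a region is disabled, prepSkipRegion() discards whole lines, reacting
// only to #ifdef/#ifndef/#else/#endif (and ignoring #define); once processing
// is re-enabled by #else or #endif, lexPreprocessor() resumes with the next
// live token via LexToken().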