1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implement the Lexer for TableGen. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "TGLexer.h" 14 #include "llvm/ADT/ArrayRef.h" 15 #include "llvm/ADT/StringSwitch.h" 16 #include "llvm/ADT/Twine.h" 17 #include "llvm/Config/config.h" // for strtoull()/strtoll() define 18 #include "llvm/Support/Compiler.h" 19 #include "llvm/Support/MemoryBuffer.h" 20 #include "llvm/Support/SourceMgr.h" 21 #include "llvm/TableGen/Error.h" 22 #include <algorithm> 23 #include <cctype> 24 #include <cerrno> 25 #include <cstdint> 26 #include <cstdio> 27 #include <cstdlib> 28 #include <cstring> 29 30 using namespace llvm; 31 32 namespace { 33 // A list of supported preprocessing directives with their 34 // internal token kinds and names. 35 struct { 36 tgtok::TokKind Kind; 37 const char *Word; 38 } PreprocessorDirs[] = { 39 { tgtok::Ifdef, "ifdef" }, 40 { tgtok::Ifndef, "ifndef" }, 41 { tgtok::Else, "else" }, 42 { tgtok::Endif, "endif" }, 43 { tgtok::Define, "define" } 44 }; 45 } // end anonymous namespace 46 47 TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) { 48 CurBuffer = SrcMgr.getMainFileID(); 49 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 50 CurPtr = CurBuf.begin(); 51 TokStart = nullptr; 52 53 // Pretend that we enter the "top-level" include file. 54 PrepIncludeStack.push_back( 55 std::make_unique<std::vector<PreprocessorControlDesc>>()); 56 57 // Put all macros defined in the command line into the DefinedMacros set. 58 for (const std::string &MacroName : Macros) 59 DefinedMacros.insert(MacroName); 60 } 61 62 SMLoc TGLexer::getLoc() const { 63 return SMLoc::getFromPointer(TokStart); 64 } 65 66 SMRange TGLexer::getLocRange() const { 67 return {getLoc(), SMLoc::getFromPointer(CurPtr)}; 68 } 69 70 /// ReturnError - Set the error to the specified string at the specified 71 /// location. This is defined to always return tgtok::Error. 72 tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) { 73 PrintError(Loc, Msg); 74 return tgtok::Error; 75 } 76 77 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 78 return ReturnError(SMLoc::getFromPointer(Loc), Msg); 79 } 80 81 bool TGLexer::processEOF() { 82 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 83 if (ParentIncludeLoc != SMLoc()) { 84 // If prepExitInclude() detects a problem with the preprocessing 85 // control stack, it will return false. Pretend that we reached 86 // the final EOF and stop lexing more tokens by returning false 87 // to LexToken(). 88 if (!prepExitInclude(false)) 89 return false; 90 91 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 92 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 93 CurPtr = ParentIncludeLoc.getPointer(); 94 // Make sure TokStart points into the parent file's buffer. 95 // LexToken() assigns to it before calling getNextChar(), 96 // so it is pointing into the included file now. 97 TokStart = CurPtr; 98 return true; 99 } 100 101 // Pretend that we exit the "top-level" include file. 102 // Note that in case of an error (e.g. control stack imbalance) 103 // the routine will issue a fatal error. 104 prepExitInclude(true); 105 return false; 106 } 107 108 int TGLexer::getNextChar() { 109 char CurChar = *CurPtr++; 110 switch (CurChar) { 111 default: 112 return (unsigned char)CurChar; 113 114 case 0: { 115 // A NUL character in the stream is either the end of the current buffer or 116 // a spurious NUL in the file. Disambiguate that here. 117 if (CurPtr - 1 == CurBuf.end()) { 118 --CurPtr; // Arrange for another call to return EOF again. 119 return EOF; 120 } 121 PrintError(getLoc(), 122 "NUL character is invalid in source; treated as space"); 123 return ' '; 124 } 125 126 case '\n': 127 case '\r': 128 // Handle the newline character by ignoring it and incrementing the line 129 // count. However, be careful about 'dos style' files with \n\r in them. 130 // Only treat a \n\r or \r\n as a single line. 131 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 132 *CurPtr != CurChar) 133 ++CurPtr; // Eat the two char newline sequence. 134 return '\n'; 135 } 136 } 137 138 int TGLexer::peekNextChar(int Index) const { 139 return *(CurPtr + Index); 140 } 141 142 tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { 143 TokStart = CurPtr; 144 // This always consumes at least one character. 145 int CurChar = getNextChar(); 146 147 switch (CurChar) { 148 default: 149 // Handle letters: [a-zA-Z_] 150 if (isalpha(CurChar) || CurChar == '_') 151 return LexIdentifier(); 152 153 // Unknown character, emit an error. 154 return ReturnError(TokStart, "Unexpected character"); 155 case EOF: 156 // Lex next token, if we just left an include file. 157 // Note that leaving an include file means that the next 158 // symbol is located at the end of the 'include "..."' 159 // construct, so LexToken() is called with default 160 // false parameter. 161 if (processEOF()) 162 return LexToken(); 163 164 // Return EOF denoting the end of lexing. 165 return tgtok::Eof; 166 167 case ':': return tgtok::colon; 168 case ';': return tgtok::semi; 169 case ',': return tgtok::comma; 170 case '<': return tgtok::less; 171 case '>': return tgtok::greater; 172 case ']': return tgtok::r_square; 173 case '{': return tgtok::l_brace; 174 case '}': return tgtok::r_brace; 175 case '(': return tgtok::l_paren; 176 case ')': return tgtok::r_paren; 177 case '=': return tgtok::equal; 178 case '?': return tgtok::question; 179 case '#': 180 if (FileOrLineStart) { 181 tgtok::TokKind Kind = prepIsDirective(); 182 if (Kind != tgtok::Error) 183 return lexPreprocessor(Kind); 184 } 185 186 return tgtok::paste; 187 188 // The period is a separate case so we can recognize the "..." 189 // range punctuator. 190 case '.': 191 if (peekNextChar(0) == '.') { 192 ++CurPtr; // Eat second dot. 193 if (peekNextChar(0) == '.') { 194 ++CurPtr; // Eat third dot. 195 return tgtok::dotdotdot; 196 } 197 return ReturnError(TokStart, "Invalid '..' punctuation"); 198 } 199 return tgtok::dot; 200 201 case '\r': 202 PrintFatalError("getNextChar() must never return '\r'"); 203 return tgtok::Error; 204 205 case ' ': 206 case '\t': 207 // Ignore whitespace. 208 return LexToken(FileOrLineStart); 209 case '\n': 210 // Ignore whitespace, and identify the new line. 211 return LexToken(true); 212 case '/': 213 // If this is the start of a // comment, skip until the end of the line or 214 // the end of the buffer. 215 if (*CurPtr == '/') 216 SkipBCPLComment(); 217 else if (*CurPtr == '*') { 218 if (SkipCComment()) 219 return tgtok::Error; 220 } else // Otherwise, this is an error. 221 return ReturnError(TokStart, "Unexpected character"); 222 return LexToken(FileOrLineStart); 223 case '-': case '+': 224 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 225 case '7': case '8': case '9': { 226 int NextChar = 0; 227 if (isdigit(CurChar)) { 228 // Allow identifiers to start with a number if it is followed by 229 // an identifier. This can happen with paste operations like 230 // foo#8i. 231 int i = 0; 232 do { 233 NextChar = peekNextChar(i++); 234 } while (isdigit(NextChar)); 235 236 if (NextChar == 'x' || NextChar == 'b') { 237 // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 238 // likely a number. 239 int NextNextChar = peekNextChar(i); 240 switch (NextNextChar) { 241 default: 242 break; 243 case '0': case '1': 244 if (NextChar == 'b') 245 return LexNumber(); 246 [[fallthrough]]; 247 case '2': case '3': case '4': case '5': 248 case '6': case '7': case '8': case '9': 249 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 250 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 251 if (NextChar == 'x') 252 return LexNumber(); 253 break; 254 } 255 } 256 } 257 258 if (isalpha(NextChar) || NextChar == '_') 259 return LexIdentifier(); 260 261 return LexNumber(); 262 } 263 case '"': return LexString(); 264 case '$': return LexVarName(); 265 case '[': return LexBracket(); 266 case '!': return LexExclaim(); 267 } 268 } 269 270 /// LexString - Lex "[^"]*" 271 tgtok::TokKind TGLexer::LexString() { 272 const char *StrStart = CurPtr; 273 274 CurStrVal = ""; 275 276 while (*CurPtr != '"') { 277 // If we hit the end of the buffer, report an error. 278 if (*CurPtr == 0 && CurPtr == CurBuf.end()) 279 return ReturnError(StrStart, "End of file in string literal"); 280 281 if (*CurPtr == '\n' || *CurPtr == '\r') 282 return ReturnError(StrStart, "End of line in string literal"); 283 284 if (*CurPtr != '\\') { 285 CurStrVal += *CurPtr++; 286 continue; 287 } 288 289 ++CurPtr; 290 291 switch (*CurPtr) { 292 case '\\': case '\'': case '"': 293 // These turn into their literal character. 294 CurStrVal += *CurPtr++; 295 break; 296 case 't': 297 CurStrVal += '\t'; 298 ++CurPtr; 299 break; 300 case 'n': 301 CurStrVal += '\n'; 302 ++CurPtr; 303 break; 304 305 case '\n': 306 case '\r': 307 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 308 309 // If we hit the end of the buffer, report an error. 310 case '\0': 311 if (CurPtr == CurBuf.end()) 312 return ReturnError(StrStart, "End of file in string literal"); 313 [[fallthrough]]; 314 default: 315 return ReturnError(CurPtr, "invalid escape in string literal"); 316 } 317 } 318 319 ++CurPtr; 320 return tgtok::StrVal; 321 } 322 323 tgtok::TokKind TGLexer::LexVarName() { 324 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 325 return ReturnError(TokStart, "Invalid variable name"); 326 327 // Otherwise, we're ok, consume the rest of the characters. 328 const char *VarNameStart = CurPtr++; 329 330 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 331 ++CurPtr; 332 333 CurStrVal.assign(VarNameStart, CurPtr); 334 return tgtok::VarName; 335 } 336 337 tgtok::TokKind TGLexer::LexIdentifier() { 338 // The first letter is [a-zA-Z_]. 339 const char *IdentStart = TokStart; 340 341 // Match the rest of the identifier regex: [0-9a-zA-Z_]* 342 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 343 ++CurPtr; 344 345 // Check to see if this identifier is a reserved keyword. 346 StringRef Str(IdentStart, CurPtr-IdentStart); 347 348 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 349 .Case("int", tgtok::Int) 350 .Case("bit", tgtok::Bit) 351 .Case("bits", tgtok::Bits) 352 .Case("string", tgtok::String) 353 .Case("list", tgtok::List) 354 .Case("code", tgtok::Code) 355 .Case("dag", tgtok::Dag) 356 .Case("class", tgtok::Class) 357 .Case("def", tgtok::Def) 358 .Case("true", tgtok::TrueVal) 359 .Case("false", tgtok::FalseVal) 360 .Case("foreach", tgtok::Foreach) 361 .Case("defm", tgtok::Defm) 362 .Case("defset", tgtok::Defset) 363 .Case("multiclass", tgtok::MultiClass) 364 .Case("field", tgtok::Field) 365 .Case("let", tgtok::Let) 366 .Case("in", tgtok::In) 367 .Case("defvar", tgtok::Defvar) 368 .Case("include", tgtok::Include) 369 .Case("if", tgtok::If) 370 .Case("then", tgtok::Then) 371 .Case("else", tgtok::ElseKW) 372 .Case("assert", tgtok::Assert) 373 .Case("dump", tgtok::Dump) 374 .Default(tgtok::Id); 375 376 // A couple of tokens require special processing. 377 switch (Kind) { 378 case tgtok::Include: 379 if (LexInclude()) return tgtok::Error; 380 return Lex(); 381 case tgtok::Id: 382 CurStrVal.assign(Str.begin(), Str.end()); 383 break; 384 default: 385 break; 386 } 387 388 return Kind; 389 } 390 391 /// LexInclude - We just read the "include" token. Get the string token that 392 /// comes next and enter the include. 393 bool TGLexer::LexInclude() { 394 // The token after the include must be a string. 395 tgtok::TokKind Tok = LexToken(); 396 if (Tok == tgtok::Error) return true; 397 if (Tok != tgtok::StrVal) { 398 PrintError(getLoc(), "Expected filename after include"); 399 return true; 400 } 401 402 // Get the string. 403 std::string Filename = CurStrVal; 404 std::string IncludedFile; 405 406 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 407 IncludedFile); 408 if (!CurBuffer) { 409 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 410 return true; 411 } 412 413 Dependencies.insert(IncludedFile); 414 // Save the line number and lex buffer of the includer. 415 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 416 CurPtr = CurBuf.begin(); 417 418 PrepIncludeStack.push_back( 419 std::make_unique<std::vector<PreprocessorControlDesc>>()); 420 return false; 421 } 422 423 /// SkipBCPLComment - Skip over the comment by finding the next CR or LF. 424 /// Or we may end up at the end of the buffer. 425 void TGLexer::SkipBCPLComment() { 426 ++CurPtr; // skip the second slash. 427 auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data()); 428 CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos; 429 } 430 431 /// SkipCComment - This skips C-style /**/ comments. The only difference from C 432 /// is that we allow nesting. 433 bool TGLexer::SkipCComment() { 434 ++CurPtr; // skip the star. 435 unsigned CommentDepth = 1; 436 437 while (true) { 438 int CurChar = getNextChar(); 439 switch (CurChar) { 440 case EOF: 441 PrintError(TokStart, "Unterminated comment!"); 442 return true; 443 case '*': 444 // End of the comment? 445 if (CurPtr[0] != '/') break; 446 447 ++CurPtr; // End the */. 448 if (--CommentDepth == 0) 449 return false; 450 break; 451 case '/': 452 // Start of a nested comment? 453 if (CurPtr[0] != '*') break; 454 ++CurPtr; 455 ++CommentDepth; 456 break; 457 } 458 } 459 } 460 461 /// LexNumber - Lex: 462 /// [-+]?[0-9]+ 463 /// 0x[0-9a-fA-F]+ 464 /// 0b[01]+ 465 tgtok::TokKind TGLexer::LexNumber() { 466 unsigned Base = 0; 467 const char *NumStart; 468 469 // Check if it's a hex or a binary value. 470 if (CurPtr[-1] == '0') { 471 NumStart = CurPtr + 1; 472 if (CurPtr[0] == 'x') { 473 Base = 16; 474 do 475 ++CurPtr; 476 while (isxdigit(CurPtr[0])); 477 } else if (CurPtr[0] == 'b') { 478 Base = 2; 479 do 480 ++CurPtr; 481 while (CurPtr[0] == '0' || CurPtr[0] == '1'); 482 } 483 } 484 485 // For a hex or binary value, we always convert it to an unsigned value. 486 bool IsMinus = false; 487 488 // Check if it's a decimal value. 489 if (Base == 0) { 490 // Check for a sign without a digit. 491 if (!isdigit(CurPtr[0])) { 492 if (CurPtr[-1] == '-') 493 return tgtok::minus; 494 else if (CurPtr[-1] == '+') 495 return tgtok::plus; 496 } 497 498 Base = 10; 499 NumStart = TokStart; 500 IsMinus = CurPtr[-1] == '-'; 501 502 while (isdigit(CurPtr[0])) 503 ++CurPtr; 504 } 505 506 // Requires at least one digit. 507 if (CurPtr == NumStart) 508 return ReturnError(TokStart, "Invalid number"); 509 510 errno = 0; 511 if (IsMinus) 512 CurIntVal = strtoll(NumStart, nullptr, Base); 513 else 514 CurIntVal = strtoull(NumStart, nullptr, Base); 515 516 if (errno == EINVAL) 517 return ReturnError(TokStart, "Invalid number"); 518 if (errno == ERANGE) 519 return ReturnError(TokStart, "Number out of range"); 520 521 return Base == 2 ? tgtok::BinaryIntVal : tgtok::IntVal; 522 } 523 524 /// LexBracket - We just read '['. If this is a code block, return it, 525 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 526 tgtok::TokKind TGLexer::LexBracket() { 527 if (CurPtr[0] != '{') 528 return tgtok::l_square; 529 ++CurPtr; 530 const char *CodeStart = CurPtr; 531 while (true) { 532 int Char = getNextChar(); 533 if (Char == EOF) break; 534 535 if (Char != '}') continue; 536 537 Char = getNextChar(); 538 if (Char == EOF) break; 539 if (Char == ']') { 540 CurStrVal.assign(CodeStart, CurPtr-2); 541 return tgtok::CodeFragment; 542 } 543 } 544 545 return ReturnError(CodeStart - 2, "Unterminated code block"); 546 } 547 548 /// LexExclaim - Lex '!' and '![a-zA-Z]+'. 549 tgtok::TokKind TGLexer::LexExclaim() { 550 if (!isalpha(*CurPtr)) 551 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 552 553 const char *Start = CurPtr++; 554 while (isalpha(*CurPtr)) 555 ++CurPtr; 556 557 // Check to see which operator this is. 558 tgtok::TokKind Kind = 559 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 560 .Case("eq", tgtok::XEq) 561 .Case("ne", tgtok::XNe) 562 .Case("le", tgtok::XLe) 563 .Case("lt", tgtok::XLt) 564 .Case("ge", tgtok::XGe) 565 .Case("gt", tgtok::XGt) 566 .Case("if", tgtok::XIf) 567 .Case("cond", tgtok::XCond) 568 .Case("isa", tgtok::XIsA) 569 .Case("head", tgtok::XHead) 570 .Case("tail", tgtok::XTail) 571 .Case("size", tgtok::XSize) 572 .Case("con", tgtok::XConcat) 573 .Case("dag", tgtok::XDag) 574 .Case("add", tgtok::XADD) 575 .Case("sub", tgtok::XSUB) 576 .Case("mul", tgtok::XMUL) 577 .Case("div", tgtok::XDIV) 578 .Case("not", tgtok::XNOT) 579 .Case("logtwo", tgtok::XLOG2) 580 .Case("and", tgtok::XAND) 581 .Case("or", tgtok::XOR) 582 .Case("xor", tgtok::XXOR) 583 .Case("shl", tgtok::XSHL) 584 .Case("sra", tgtok::XSRA) 585 .Case("srl", tgtok::XSRL) 586 .Case("cast", tgtok::XCast) 587 .Case("empty", tgtok::XEmpty) 588 .Case("subst", tgtok::XSubst) 589 .Case("foldl", tgtok::XFoldl) 590 .Case("foreach", tgtok::XForEach) 591 .Case("filter", tgtok::XFilter) 592 .Case("listconcat", tgtok::XListConcat) 593 .Case("listsplat", tgtok::XListSplat) 594 .Case("listremove", tgtok::XListRemove) 595 .Case("range", tgtok::XRange) 596 .Case("strconcat", tgtok::XStrConcat) 597 .Case("interleave", tgtok::XInterleave) 598 .Case("substr", tgtok::XSubstr) 599 .Case("find", tgtok::XFind) 600 .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated. 601 .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated. 602 .Case("getdagarg", tgtok::XGetDagArg) 603 .Case("getdagname", tgtok::XGetDagName) 604 .Case("setdagarg", tgtok::XSetDagArg) 605 .Case("setdagname", tgtok::XSetDagName) 606 .Case("exists", tgtok::XExists) 607 .Case("tolower", tgtok::XToLower) 608 .Case("toupper", tgtok::XToUpper) 609 .Case("repr", tgtok::XRepr) 610 .Default(tgtok::Error); 611 612 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 613 } 614 615 bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { 616 // Report an error, if preprocessor control stack for the current 617 // file is not empty. 618 if (!PrepIncludeStack.back()->empty()) { 619 prepReportPreprocessorStackError(); 620 621 return false; 622 } 623 624 // Pop the preprocessing controls from the include stack. 625 if (PrepIncludeStack.empty()) { 626 PrintFatalError("Preprocessor include stack is empty"); 627 } 628 629 PrepIncludeStack.pop_back(); 630 631 if (IncludeStackMustBeEmpty) { 632 if (!PrepIncludeStack.empty()) 633 PrintFatalError("Preprocessor include stack is not empty"); 634 } else { 635 if (PrepIncludeStack.empty()) 636 PrintFatalError("Preprocessor include stack is empty"); 637 } 638 639 return true; 640 } 641 642 tgtok::TokKind TGLexer::prepIsDirective() const { 643 for (const auto &PD : PreprocessorDirs) { 644 int NextChar = *CurPtr; 645 bool Match = true; 646 unsigned I = 0; 647 for (; I < strlen(PD.Word); ++I) { 648 if (NextChar != PD.Word[I]) { 649 Match = false; 650 break; 651 } 652 653 NextChar = peekNextChar(I + 1); 654 } 655 656 // Check for whitespace after the directive. If there is no whitespace, 657 // then we do not recognize it as a preprocessing directive. 658 if (Match) { 659 tgtok::TokKind Kind = PD.Kind; 660 661 // New line and EOF may follow only #else/#endif. It will be reported 662 // as an error for #ifdef/#define after the call to prepLexMacroName(). 663 if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF || 664 NextChar == '\n' || 665 // It looks like TableGen does not support '\r' as the actual 666 // carriage return, e.g. getNextChar() treats a single '\r' 667 // as '\n'. So we do the same here. 668 NextChar == '\r') 669 return Kind; 670 671 // Allow comments after some directives, e.g.: 672 // #else// OR #else/**/ 673 // #endif// OR #endif/**/ 674 // 675 // Note that we do allow comments after #ifdef/#define here, e.g. 676 // #ifdef/**/ AND #ifdef// 677 // #define/**/ AND #define// 678 // 679 // These cases will be reported as incorrect after calling 680 // prepLexMacroName(). We could have supported C-style comments 681 // after #ifdef/#define, but this would complicate the code 682 // for little benefit. 683 if (NextChar == '/') { 684 NextChar = peekNextChar(I + 1); 685 686 if (NextChar == '*' || NextChar == '/') 687 return Kind; 688 689 // Pretend that we do not recognize the directive. 690 } 691 } 692 } 693 694 return tgtok::Error; 695 } 696 697 bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { 698 TokStart = CurPtr; 699 700 for (const auto &PD : PreprocessorDirs) 701 if (PD.Kind == Kind) { 702 // Advance CurPtr to the end of the preprocessing word. 703 CurPtr += strlen(PD.Word); 704 return true; 705 } 706 707 PrintFatalError("Unsupported preprocessing token in " 708 "prepEatPreprocessorDirective()"); 709 return false; 710 } 711 712 tgtok::TokKind TGLexer::lexPreprocessor( 713 tgtok::TokKind Kind, bool ReturnNextLiveToken) { 714 715 // We must be looking at a preprocessing directive. Eat it! 716 if (!prepEatPreprocessorDirective(Kind)) 717 PrintFatalError("lexPreprocessor() called for unknown " 718 "preprocessor directive"); 719 720 if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) { 721 StringRef MacroName = prepLexMacroName(); 722 StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef"; 723 if (MacroName.empty()) 724 return ReturnError(TokStart, "Expected macro name after " + IfTokName); 725 726 bool MacroIsDefined = DefinedMacros.count(MacroName) != 0; 727 728 // Canonicalize ifndef's MacroIsDefined to its ifdef equivalent. 729 if (Kind == tgtok::Ifndef) 730 MacroIsDefined = !MacroIsDefined; 731 732 // Regardless of whether we are processing tokens or not, 733 // we put the #ifdef control on stack. 734 // Note that MacroIsDefined has been canonicalized against ifdef. 735 PrepIncludeStack.back()->push_back( 736 {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)}); 737 738 if (!prepSkipDirectiveEnd()) 739 return ReturnError(CurPtr, "Only comments are supported after " + 740 IfTokName + " NAME"); 741 742 // If we were not processing tokens before this #ifdef, 743 // then just return back to the lines skipping code. 744 if (!ReturnNextLiveToken) 745 return Kind; 746 747 // If we were processing tokens before this #ifdef, 748 // and the macro is defined, then just return the next token. 749 if (MacroIsDefined) 750 return LexToken(); 751 752 // We were processing tokens before this #ifdef, and the macro 753 // is not defined, so we have to start skipping the lines. 754 // If the skipping is successful, it will return the token following 755 // either #else or #endif corresponding to this #ifdef. 756 if (prepSkipRegion(ReturnNextLiveToken)) 757 return LexToken(); 758 759 return tgtok::Error; 760 } else if (Kind == tgtok::Else) { 761 // Check if this #else is correct before calling prepSkipDirectiveEnd(), 762 // which will move CurPtr away from the beginning of #else. 763 if (PrepIncludeStack.back()->empty()) 764 return ReturnError(TokStart, "#else without #ifdef or #ifndef"); 765 766 PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back(); 767 768 if (IfdefEntry.Kind != tgtok::Ifdef) { 769 PrintError(TokStart, "double #else"); 770 return ReturnError(IfdefEntry.SrcPos, "Previous #else is here"); 771 } 772 773 // Replace the corresponding #ifdef's control with its negation 774 // on the control stack. 775 PrepIncludeStack.back()->pop_back(); 776 PrepIncludeStack.back()->push_back( 777 {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)}); 778 779 if (!prepSkipDirectiveEnd()) 780 return ReturnError(CurPtr, "Only comments are supported after #else"); 781 782 // If we were processing tokens before this #else, 783 // we have to start skipping lines until the matching #endif. 784 if (ReturnNextLiveToken) { 785 if (prepSkipRegion(ReturnNextLiveToken)) 786 return LexToken(); 787 788 return tgtok::Error; 789 } 790 791 // Return to the lines skipping code. 792 return Kind; 793 } else if (Kind == tgtok::Endif) { 794 // Check if this #endif is correct before calling prepSkipDirectiveEnd(), 795 // which will move CurPtr away from the beginning of #endif. 796 if (PrepIncludeStack.back()->empty()) 797 return ReturnError(TokStart, "#endif without #ifdef"); 798 799 auto &IfdefOrElseEntry = PrepIncludeStack.back()->back(); 800 801 if (IfdefOrElseEntry.Kind != tgtok::Ifdef && 802 IfdefOrElseEntry.Kind != tgtok::Else) { 803 PrintFatalError("Invalid preprocessor control on the stack"); 804 return tgtok::Error; 805 } 806 807 if (!prepSkipDirectiveEnd()) 808 return ReturnError(CurPtr, "Only comments are supported after #endif"); 809 810 PrepIncludeStack.back()->pop_back(); 811 812 // If we were processing tokens before this #endif, then 813 // we should continue it. 814 if (ReturnNextLiveToken) { 815 return LexToken(); 816 } 817 818 // Return to the lines skipping code. 819 return Kind; 820 } else if (Kind == tgtok::Define) { 821 StringRef MacroName = prepLexMacroName(); 822 if (MacroName.empty()) 823 return ReturnError(TokStart, "Expected macro name after #define"); 824 825 if (!DefinedMacros.insert(MacroName).second) 826 PrintWarning(getLoc(), 827 "Duplicate definition of macro: " + Twine(MacroName)); 828 829 if (!prepSkipDirectiveEnd()) 830 return ReturnError(CurPtr, 831 "Only comments are supported after #define NAME"); 832 833 if (!ReturnNextLiveToken) { 834 PrintFatalError("#define must be ignored during the lines skipping"); 835 return tgtok::Error; 836 } 837 838 return LexToken(); 839 } 840 841 PrintFatalError("Preprocessing directive is not supported"); 842 return tgtok::Error; 843 } 844 845 bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { 846 if (!MustNeverBeFalse) 847 PrintFatalError("Invalid recursion."); 848 849 do { 850 // Skip all symbols to the line end. 851 prepSkipToLineEnd(); 852 853 // Find the first non-whitespace symbol in the next line(s). 854 if (!prepSkipLineBegin()) 855 return false; 856 857 // If the first non-blank/comment symbol on the line is '#', 858 // it may be a start of preprocessing directive. 859 // 860 // If it is not '#' just go to the next line. 861 if (*CurPtr == '#') 862 ++CurPtr; 863 else 864 continue; 865 866 tgtok::TokKind Kind = prepIsDirective(); 867 868 // If we did not find a preprocessing directive or it is #define, 869 // then just skip to the next line. We do not have to do anything 870 // for #define in the line-skipping mode. 871 if (Kind == tgtok::Error || Kind == tgtok::Define) 872 continue; 873 874 tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false); 875 876 // If lexPreprocessor() encountered an error during lexing this 877 // preprocessor idiom, then return false to the calling lexPreprocessor(). 878 // This will force tgtok::Error to be returned to the tokens processing. 879 if (ProcessedKind == tgtok::Error) 880 return false; 881 882 if (Kind != ProcessedKind) 883 PrintFatalError("prepIsDirective() and lexPreprocessor() " 884 "returned different token kinds"); 885 886 // If this preprocessing directive enables tokens processing, 887 // then return to the lexPreprocessor() and get to the next token. 888 // We can move from line-skipping mode to processing tokens only 889 // due to #else or #endif. 890 if (prepIsProcessingEnabled()) { 891 if (Kind != tgtok::Else && Kind != tgtok::Endif) { 892 PrintFatalError("Tokens processing was enabled by an unexpected " 893 "preprocessing directive"); 894 return false; 895 } 896 897 return true; 898 } 899 } while (CurPtr != CurBuf.end()); 900 901 // We have reached the end of the file, but never left the lines-skipping 902 // mode. This means there is no matching #endif. 903 prepReportPreprocessorStackError(); 904 return false; 905 } 906 907 StringRef TGLexer::prepLexMacroName() { 908 // Skip whitespaces between the preprocessing directive and the macro name. 909 while (*CurPtr == ' ' || *CurPtr == '\t') 910 ++CurPtr; 911 912 TokStart = CurPtr; 913 // Macro names start with [a-zA-Z_]. 914 if (*CurPtr != '_' && !isalpha(*CurPtr)) 915 return ""; 916 917 // Match the rest of the identifier regex: [0-9a-zA-Z_]* 918 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 919 ++CurPtr; 920 921 return StringRef(TokStart, CurPtr - TokStart); 922 } 923 924 bool TGLexer::prepSkipLineBegin() { 925 while (CurPtr != CurBuf.end()) { 926 switch (*CurPtr) { 927 case ' ': 928 case '\t': 929 case '\n': 930 case '\r': 931 break; 932 933 case '/': { 934 int NextChar = peekNextChar(1); 935 if (NextChar == '*') { 936 // Skip C-style comment. 937 // Note that we do not care about skipping the C++-style comments. 938 // If the line contains "//", it may not contain any processable 939 // preprocessing directive. Just return CurPtr pointing to 940 // the first '/' in this case. We also do not care about 941 // incorrect symbols after the first '/' - we are in lines-skipping 942 // mode, so incorrect code is allowed to some extent. 943 944 // Set TokStart to the beginning of the comment to enable proper 945 // diagnostic printing in case of error in SkipCComment(). 946 TokStart = CurPtr; 947 948 // CurPtr must point to '*' before call to SkipCComment(). 949 ++CurPtr; 950 if (SkipCComment()) 951 return false; 952 } else { 953 // CurPtr points to the non-whitespace '/'. 954 return true; 955 } 956 957 // We must not increment CurPtr after the comment was lexed. 958 continue; 959 } 960 961 default: 962 return true; 963 } 964 965 ++CurPtr; 966 } 967 968 // We have reached the end of the file. Return to the lines skipping 969 // code, and allow it to handle the EOF as needed. 970 return true; 971 } 972 973 bool TGLexer::prepSkipDirectiveEnd() { 974 while (CurPtr != CurBuf.end()) { 975 switch (*CurPtr) { 976 case ' ': 977 case '\t': 978 break; 979 980 case '\n': 981 case '\r': 982 return true; 983 984 case '/': { 985 int NextChar = peekNextChar(1); 986 if (NextChar == '/') { 987 // Skip C++-style comment. 988 // We may just return true now, but let's skip to the line/buffer end 989 // to simplify the method specification. 990 ++CurPtr; 991 SkipBCPLComment(); 992 } else if (NextChar == '*') { 993 // When we are skipping C-style comment at the end of a preprocessing 994 // directive, we can skip several lines. If any meaningful TD token 995 // follows the end of the C-style comment on the same line, it will 996 // be considered as an invalid usage of TD token. 997 // For example, we want to forbid usages like this one: 998 // #define MACRO class Class {} 999 // But with C-style comments we also disallow the following: 1000 // #define MACRO /* This macro is used 1001 // to ... */ class Class {} 1002 // One can argue that this should be allowed, but it does not seem 1003 // to be worth of the complication. Moreover, this matches 1004 // the C preprocessor behavior. 1005 1006 // Set TokStart to the beginning of the comment to enable proper 1007 // diagnostic printer in case of error in SkipCComment(). 1008 TokStart = CurPtr; 1009 ++CurPtr; 1010 if (SkipCComment()) 1011 return false; 1012 } else { 1013 TokStart = CurPtr; 1014 PrintError(CurPtr, "Unexpected character"); 1015 return false; 1016 } 1017 1018 // We must not increment CurPtr after the comment was lexed. 1019 continue; 1020 } 1021 1022 default: 1023 // Do not allow any non-whitespaces after the directive. 1024 TokStart = CurPtr; 1025 return false; 1026 } 1027 1028 ++CurPtr; 1029 } 1030 1031 return true; 1032 } 1033 1034 void TGLexer::prepSkipToLineEnd() { 1035 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) 1036 ++CurPtr; 1037 } 1038 1039 bool TGLexer::prepIsProcessingEnabled() { 1040 for (const PreprocessorControlDesc &I : 1041 llvm::reverse(*PrepIncludeStack.back())) 1042 if (!I.IsDefined) 1043 return false; 1044 1045 return true; 1046 } 1047 1048 void TGLexer::prepReportPreprocessorStackError() { 1049 if (PrepIncludeStack.back()->empty()) 1050 PrintFatalError("prepReportPreprocessorStackError() called with " 1051 "empty control stack"); 1052 1053 auto &PrepControl = PrepIncludeStack.back()->back(); 1054 PrintError(CurBuf.end(), "Reached EOF without matching #endif"); 1055 PrintError(PrepControl.SrcPos, "The latest preprocessor control is here"); 1056 1057 TokStart = CurPtr; 1058 } 1059