1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class represents the Lexer for tablegen files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H 14 #define LLVM_LIB_TABLEGEN_TGLEXER_H 15 16 #include "llvm/ADT/StringRef.h" 17 #include "llvm/ADT/StringSet.h" 18 #include "llvm/Support/DataTypes.h" 19 #include "llvm/Support/SMLoc.h" 20 #include <cassert> 21 #include <memory> 22 #include <set> 23 #include <string> 24 #include <vector> 25 26 namespace llvm { 27 template <typename T> class ArrayRef; 28 class SourceMgr; 29 class Twine; 30 31 namespace tgtok { 32 enum TokKind { 33 // Markers 34 Eof, 35 Error, 36 37 // Tokens with no info. 38 minus, // - 39 plus, // + 40 l_square, // [ 41 r_square, // ] 42 l_brace, // { 43 r_brace, // } 44 l_paren, // ( 45 r_paren, // ) 46 less, // < 47 greater, // > 48 colon, // : 49 semi, // ; 50 comma, // , 51 dot, // . 52 equal, // = 53 question, // ? 54 paste, // # 55 dotdotdot, // ... 56 57 // Boolean literals. 58 TrueVal, 59 FalseVal, 60 61 // Integer value. 62 IntVal, 63 64 // Binary constant. Note that these are sized according to the number of 65 // bits given. 66 BinaryIntVal, 67 68 // Preprocessing tokens for internal usage by the lexer. 69 // They are never returned as a result of Lex(). 70 Ifdef, 71 Ifndef, 72 Else, 73 Endif, 74 Define, 75 76 // Reserved keywords. ('ElseKW' is named to distinguish it from the 77 // existing 'Else' that means the preprocessor #else.) 78 Bit, 79 Bits, 80 Code, 81 Dag, 82 ElseKW, 83 FalseKW, 84 Field, 85 In, 86 Include, 87 Int, 88 List, 89 String, 90 Then, 91 TrueKW, 92 93 // Object start tokens. 94 OBJECT_START_FIRST, 95 Assert = OBJECT_START_FIRST, 96 Class, 97 Def, 98 Defm, 99 Defset, 100 Defvar, 101 Dump, 102 Foreach, 103 If, 104 Let, 105 MultiClass, 106 OBJECT_START_LAST = MultiClass, 107 108 // Bang operators. 109 BANG_OPERATOR_FIRST, 110 XConcat = BANG_OPERATOR_FIRST, 111 XADD, 112 XSUB, 113 XMUL, 114 XDIV, 115 XNOT, 116 XLOG2, 117 XAND, 118 XOR, 119 XXOR, 120 XSRA, 121 XSRL, 122 XSHL, 123 XListConcat, 124 XListSplat, 125 XStrConcat, 126 XInterleave, 127 XSubstr, 128 XFind, 129 XCast, 130 XSubst, 131 XForEach, 132 XFilter, 133 XFoldl, 134 XHead, 135 XTail, 136 XSize, 137 XEmpty, 138 XIf, 139 XCond, 140 XEq, 141 XIsA, 142 XDag, 143 XNe, 144 XLe, 145 XLt, 146 XGe, 147 XGt, 148 XSetDagOp, 149 XGetDagOp, 150 XExists, 151 XListRemove, 152 XToLower, 153 XToUpper, 154 XRange, 155 XGetDagArg, 156 XGetDagName, 157 XSetDagArg, 158 XSetDagName, 159 XRepr, 160 BANG_OPERATOR_LAST = XRepr, 161 162 // String valued tokens. 163 STRING_VALUE_FIRST, 164 Id = STRING_VALUE_FIRST, 165 StrVal, 166 VarName, 167 CodeFragment, 168 STRING_VALUE_LAST = CodeFragment, 169 }; 170 171 /// isBangOperator - Return true if this is a bang operator. 172 static inline bool isBangOperator(tgtok::TokKind Kind) { 173 return tgtok::BANG_OPERATOR_FIRST <= Kind && Kind <= BANG_OPERATOR_LAST; 174 } 175 176 /// isObjectStart - Return true if this is a valid first token for a statement. 177 static inline bool isObjectStart(tgtok::TokKind Kind) { 178 return tgtok::OBJECT_START_FIRST <= Kind && Kind <= OBJECT_START_LAST; 179 } 180 181 /// isStringValue - Return true if this is a string value. 182 static inline bool isStringValue(tgtok::TokKind Kind) { 183 return tgtok::STRING_VALUE_FIRST <= Kind && Kind <= STRING_VALUE_LAST; 184 } 185 } // namespace tgtok 186 187 /// TGLexer - TableGen Lexer class. 188 class TGLexer { 189 SourceMgr &SrcMgr; 190 191 const char *CurPtr = nullptr; 192 StringRef CurBuf; 193 194 // Information about the current token. 195 const char *TokStart = nullptr; 196 tgtok::TokKind CurCode = tgtok::TokKind::Eof; 197 std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment 198 int64_t CurIntVal = 0; // This is valid for IntVal. 199 200 /// CurBuffer - This is the current buffer index we're lexing from as managed 201 /// by the SourceMgr object. 202 unsigned CurBuffer = 0; 203 204 public: 205 typedef std::set<std::string> DependenciesSetTy; 206 207 private: 208 /// Dependencies - This is the list of all included files. 209 DependenciesSetTy Dependencies; 210 211 public: 212 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); 213 214 tgtok::TokKind Lex() { 215 return CurCode = LexToken(CurPtr == CurBuf.begin()); 216 } 217 218 const DependenciesSetTy &getDependencies() const { 219 return Dependencies; 220 } 221 222 tgtok::TokKind getCode() const { return CurCode; } 223 224 const std::string &getCurStrVal() const { 225 assert(tgtok::isStringValue(CurCode) && 226 "This token doesn't have a string value"); 227 return CurStrVal; 228 } 229 int64_t getCurIntVal() const { 230 assert(CurCode == tgtok::IntVal && "This token isn't an integer"); 231 return CurIntVal; 232 } 233 std::pair<int64_t, unsigned> getCurBinaryIntVal() const { 234 assert(CurCode == tgtok::BinaryIntVal && 235 "This token isn't a binary integer"); 236 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); 237 } 238 239 SMLoc getLoc() const; 240 SMRange getLocRange() const; 241 242 private: 243 /// LexToken - Read the next token and return its code. 244 tgtok::TokKind LexToken(bool FileOrLineStart = false); 245 246 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); 247 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); 248 249 int getNextChar(); 250 int peekNextChar(int Index) const; 251 void SkipBCPLComment(); 252 bool SkipCComment(); 253 tgtok::TokKind LexIdentifier(); 254 bool LexInclude(); 255 tgtok::TokKind LexString(); 256 tgtok::TokKind LexVarName(); 257 tgtok::TokKind LexNumber(); 258 tgtok::TokKind LexBracket(); 259 tgtok::TokKind LexExclaim(); 260 261 // Process EOF encountered in LexToken(). 262 // If EOF is met in an include file, then the method will update 263 // CurPtr, CurBuf and preprocessing include stack, and return true. 264 // If EOF is met in the top-level file, then the method will 265 // update and check the preprocessing include stack, and return false. 266 bool processEOF(); 267 268 // *** Structures and methods for preprocessing support *** 269 270 // A set of macro names that are defined either via command line or 271 // by using: 272 // #define NAME 273 StringSet<> DefinedMacros; 274 275 // Each of #ifdef and #else directives has a descriptor associated 276 // with it. 277 // 278 // An ordered list of preprocessing controls defined by #ifdef/#else 279 // directives that are in effect currently is called preprocessing 280 // control stack. It is represented as a vector of PreprocessorControlDesc's. 281 // 282 // The control stack is updated according to the following rules: 283 // 284 // For each #ifdef we add an element to the control stack. 285 // For each #else we replace the top element with a descriptor 286 // with an inverted IsDefined value. 287 // For each #endif we pop the top element from the control stack. 288 // 289 // When CurPtr reaches the current buffer's end, the control stack 290 // must be empty, i.e. #ifdef and the corresponding #endif 291 // must be located in the same file. 292 struct PreprocessorControlDesc { 293 // Either tgtok::Ifdef or tgtok::Else. 294 tgtok::TokKind Kind; 295 296 // True, if the condition for this directive is true, false - otherwise. 297 // Examples: 298 // #ifdef NAME : true, if NAME is defined, false - otherwise. 299 // ... 300 // #else : false, if NAME is defined, true - otherwise. 301 bool IsDefined; 302 303 // Pointer into CurBuf to the beginning of the preprocessing directive 304 // word, e.g.: 305 // #ifdef NAME 306 // ^ - SrcPos 307 SMLoc SrcPos; 308 }; 309 310 // We want to disallow code like this: 311 // file1.td: 312 // #define NAME 313 // #ifdef NAME 314 // include "file2.td" 315 // EOF 316 // file2.td: 317 // #endif 318 // EOF 319 // 320 // To do this, we clear the preprocessing control stack on entry 321 // to each of the included file. PrepIncludeStack is used to store 322 // preprocessing control stacks for the current file and all its 323 // parent files. The back() element is the preprocessing control 324 // stack for the current file. 325 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> 326 PrepIncludeStack; 327 328 // Validate that the current preprocessing control stack is empty, 329 // since we are about to exit a file, and pop the include stack. 330 // 331 // If IncludeStackMustBeEmpty is true, the include stack must be empty 332 // after the popping, otherwise, the include stack must not be empty 333 // after the popping. Basically, the include stack must be empty 334 // only if we exit the "top-level" file (i.e. finish lexing). 335 // 336 // The method returns false, if the current preprocessing control stack 337 // is not empty (e.g. there is an unterminated #ifdef/#else), 338 // true - otherwise. 339 bool prepExitInclude(bool IncludeStackMustBeEmpty); 340 341 // Look ahead for a preprocessing directive starting from CurPtr. The caller 342 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches 343 // a preprocessing directive word followed by a whitespace, then it returns 344 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. 345 // 346 // CurPtr is not adjusted by this method. 347 tgtok::TokKind prepIsDirective() const; 348 349 // Given a preprocessing token kind, adjusts CurPtr to the end 350 // of the preprocessing directive word. Returns true, unless 351 // an unsupported token kind is passed in. 352 // 353 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() 354 // to avoid adjusting CurPtr before we are sure that '#' is followed 355 // by a preprocessing directive. If it is not, then we fall back to 356 // tgtok::paste interpretation of '#'. 357 bool prepEatPreprocessorDirective(tgtok::TokKind Kind); 358 359 // The main "exit" point from the token parsing to preprocessor. 360 // 361 // The method is called for CurPtr, when prepIsDirective() returns 362 // true. The first parameter matches the result of prepIsDirective(), 363 // denoting the actual preprocessor directive to be processed. 364 // 365 // If the preprocessing directive disables the tokens processing, e.g.: 366 // #ifdef NAME // NAME is undefined 367 // then lexPreprocessor() enters the lines-skipping mode. 368 // In this mode, it does not parse any tokens, because the code under 369 // the #ifdef may not even be a correct tablegen code. The preprocessor 370 // looks for lines containing other preprocessing directives, which 371 // may be prepended with whitespaces and C-style comments. If the line 372 // does not contain a preprocessing directive, it is skipped completely. 373 // Otherwise, the preprocessing directive is processed by recursively 374 // calling lexPreprocessor(). The processing of the encountered 375 // preprocessing directives includes updating preprocessing control stack 376 // and adding new macros into DefinedMacros set. 377 // 378 // The second parameter controls whether lexPreprocessor() is called from 379 // LexToken() (true) or recursively from lexPreprocessor() (false). 380 // 381 // If ReturnNextLiveToken is true, the method returns the next 382 // LEX token following the current directive or following the end 383 // of the disabled preprocessing region corresponding to this directive. 384 // If ReturnNextLiveToken is false, the method returns the first parameter, 385 // unless there were errors encountered in the disabled preprocessing 386 // region - in this case, it returns tgtok::Error. 387 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, 388 bool ReturnNextLiveToken = true); 389 390 // Worker method for lexPreprocessor() to skip lines after some 391 // preprocessing directive up to the buffer end or to the directive 392 // that re-enables token processing. The method returns true 393 // upon processing the next directive that re-enables tokens 394 // processing. False is returned if an error was encountered. 395 // 396 // Note that prepSkipRegion() calls lexPreprocessor() to process 397 // encountered preprocessing directives. In this case, the second 398 // parameter to lexPreprocessor() is set to false. Being passed 399 // false ReturnNextLiveToken, lexPreprocessor() must never call 400 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken 401 // to prepSkipRegion() and checking that it is never set to false. 402 bool prepSkipRegion(bool MustNeverBeFalse); 403 404 // Lex name of the macro after either #ifdef or #define. We could have used 405 // LexIdentifier(), but it has special handling of "include" word, which 406 // could result in awkward diagnostic errors. Consider: 407 // ---- 408 // #ifdef include 409 // class ... 410 // ---- 411 // LexIdentifier() will engage LexInclude(), which will complain about 412 // missing file with name "class". Instead, prepLexMacroName() will treat 413 // "include" as a normal macro name. 414 // 415 // On entry, CurPtr points to the end of a preprocessing directive word. 416 // The method allows for whitespaces between the preprocessing directive 417 // and the macro name. The allowed whitespaces are ' ' and '\t'. 418 // 419 // If the first non-whitespace symbol after the preprocessing directive 420 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then 421 // the method updates TokStart to the position of the first non-whitespace 422 // symbol, sets CurPtr to the position of the macro name's last symbol, 423 // and returns a string reference to the macro name. Otherwise, 424 // TokStart is set to the first non-whitespace symbol after the preprocessing 425 // directive, and the method returns an empty string reference. 426 // 427 // In all cases, TokStart may be used to point to the word following 428 // the preprocessing directive. 429 StringRef prepLexMacroName(); 430 431 // Skip any whitespaces starting from CurPtr. The method is used 432 // only in the lines-skipping mode to find the first non-whitespace 433 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' 434 // and '\r'. The method skips C-style comments as well, because 435 // it is used to find the beginning of the preprocessing directive. 436 // If we do not handle C-style comments the following code would 437 // result in incorrect detection of a preprocessing directive: 438 // /* 439 // #ifdef NAME 440 // */ 441 // As long as we skip C-style comments, the following code is correctly 442 // recognized as a preprocessing directive: 443 // /* first line comment 444 // second line comment */ #ifdef NAME 445 // 446 // The method returns true upon reaching the first non-whitespace symbol 447 // or EOF, CurPtr is set to point to this symbol. The method returns false, 448 // if an error occurred during skipping of a C-style comment. 449 bool prepSkipLineBegin(); 450 451 // Skip any whitespaces or comments after a preprocessing directive. 452 // The method returns true upon reaching either end of the line 453 // or end of the file. If there is a multiline C-style comment 454 // after the preprocessing directive, the method skips 455 // the comment, so the final CurPtr may point to one of the next lines. 456 // The method returns false, if an error occurred during skipping 457 // C- or C++-style comment, or a non-whitespace symbol appears 458 // after the preprocessing directive. 459 // 460 // The method maybe called both during lines-skipping and tokens 461 // processing. It actually verifies that only whitespaces or/and 462 // comments follow a preprocessing directive. 463 // 464 // After the execution of this mehod, CurPtr points either to new line 465 // symbol, buffer end or non-whitespace symbol following the preprocesing 466 // directive. 467 bool prepSkipDirectiveEnd(); 468 469 // Skip all symbols to the end of the line/file. 470 // The method adjusts CurPtr, so that it points to either new line 471 // symbol in the current line or the buffer end. 472 void prepSkipToLineEnd(); 473 474 // Return true, if the current preprocessor control stack is such that 475 // we should allow lexer to process the next token, false - otherwise. 476 // 477 // In particular, the method returns true, if all the #ifdef/#else 478 // controls on the stack have their IsDefined member set to true. 479 bool prepIsProcessingEnabled(); 480 481 // Report an error, if we reach EOF with non-empty preprocessing control 482 // stack. This means there is no matching #endif for the previous 483 // #ifdef/#else. 484 void prepReportPreprocessorStackError(); 485 }; 486 487 } // end namespace llvm 488 489 #endif 490