1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class represents the Lexer for tablegen files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H 14 #define LLVM_LIB_TABLEGEN_TGLEXER_H 15 16 #include "llvm/ADT/StringRef.h" 17 #include "llvm/ADT/StringSet.h" 18 #include "llvm/Support/DataTypes.h" 19 #include "llvm/Support/SMLoc.h" 20 #include <cassert> 21 #include <memory> 22 #include <set> 23 #include <string> 24 #include <vector> 25 26 namespace llvm { 27 template <typename T> class ArrayRef; 28 class SourceMgr; 29 class Twine; 30 31 namespace tgtok { 32 enum TokKind { 33 // Markers 34 Eof, 35 Error, 36 37 // Tokens with no info. 38 minus, // - 39 plus, // + 40 l_square, // [ 41 r_square, // ] 42 l_brace, // { 43 r_brace, // } 44 l_paren, // ( 45 r_paren, // ) 46 less, // < 47 greater, // > 48 colon, // : 49 semi, // ; 50 comma, // , 51 dot, // . 52 equal, // = 53 question, // ? 54 paste, // # 55 dotdotdot, // ... 56 57 // Boolean literals. 58 TrueVal, 59 FalseVal, 60 61 // Integer value. 62 IntVal, 63 64 // Binary constant. Note that these are sized according to the number of 65 // bits given. 66 BinaryIntVal, 67 68 // Preprocessing tokens for internal usage by the lexer. 69 // They are never returned as a result of Lex(). 70 Ifdef, 71 Ifndef, 72 Else, 73 Endif, 74 Define, 75 76 // Reserved keywords. ('ElseKW' is named to distinguish it from the 77 // existing 'Else' that means the preprocessor #else.) 78 Bit, 79 Bits, 80 Code, 81 Dag, 82 ElseKW, 83 FalseKW, 84 Field, 85 In, 86 Include, 87 Int, 88 List, 89 String, 90 Then, 91 TrueKW, 92 93 // Object start tokens. 94 OBJECT_START_FIRST, 95 Assert = OBJECT_START_FIRST, 96 Class, 97 Def, 98 Defm, 99 Defset, 100 Deftype, 101 Defvar, 102 Dump, 103 Foreach, 104 If, 105 Let, 106 MultiClass, 107 OBJECT_START_LAST = MultiClass, 108 109 // Bang operators. 110 BANG_OPERATOR_FIRST, 111 XConcat = BANG_OPERATOR_FIRST, 112 XADD, 113 XSUB, 114 XMUL, 115 XDIV, 116 XNOT, 117 XLOG2, 118 XAND, 119 XOR, 120 XXOR, 121 XSRA, 122 XSRL, 123 XSHL, 124 XListConcat, 125 XListSplat, 126 XStrConcat, 127 XInterleave, 128 XSubstr, 129 XFind, 130 XCast, 131 XSubst, 132 XForEach, 133 XFilter, 134 XFoldl, 135 XHead, 136 XTail, 137 XSize, 138 XEmpty, 139 XIf, 140 XCond, 141 XEq, 142 XIsA, 143 XDag, 144 XNe, 145 XLe, 146 XLt, 147 XGe, 148 XGt, 149 XSetDagOp, 150 XGetDagOp, 151 XExists, 152 XListRemove, 153 XToLower, 154 XToUpper, 155 XRange, 156 XGetDagArg, 157 XGetDagName, 158 XSetDagArg, 159 XSetDagName, 160 XRepr, 161 BANG_OPERATOR_LAST = XRepr, 162 163 // String valued tokens. 164 STRING_VALUE_FIRST, 165 Id = STRING_VALUE_FIRST, 166 StrVal, 167 VarName, 168 CodeFragment, 169 STRING_VALUE_LAST = CodeFragment, 170 }; 171 172 /// isBangOperator - Return true if this is a bang operator. 173 static inline bool isBangOperator(tgtok::TokKind Kind) { 174 return tgtok::BANG_OPERATOR_FIRST <= Kind && Kind <= BANG_OPERATOR_LAST; 175 } 176 177 /// isObjectStart - Return true if this is a valid first token for a statement. 178 static inline bool isObjectStart(tgtok::TokKind Kind) { 179 return tgtok::OBJECT_START_FIRST <= Kind && Kind <= OBJECT_START_LAST; 180 } 181 182 /// isStringValue - Return true if this is a string value. 183 static inline bool isStringValue(tgtok::TokKind Kind) { 184 return tgtok::STRING_VALUE_FIRST <= Kind && Kind <= STRING_VALUE_LAST; 185 } 186 } // namespace tgtok 187 188 /// TGLexer - TableGen Lexer class. 189 class TGLexer { 190 SourceMgr &SrcMgr; 191 192 const char *CurPtr = nullptr; 193 StringRef CurBuf; 194 195 // Information about the current token. 196 const char *TokStart = nullptr; 197 tgtok::TokKind CurCode = tgtok::TokKind::Eof; 198 std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment 199 int64_t CurIntVal = 0; // This is valid for IntVal. 200 201 /// CurBuffer - This is the current buffer index we're lexing from as managed 202 /// by the SourceMgr object. 203 unsigned CurBuffer = 0; 204 205 public: 206 typedef std::set<std::string> DependenciesSetTy; 207 208 private: 209 /// Dependencies - This is the list of all included files. 210 DependenciesSetTy Dependencies; 211 212 public: 213 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); 214 215 tgtok::TokKind Lex() { 216 return CurCode = LexToken(CurPtr == CurBuf.begin()); 217 } 218 219 const DependenciesSetTy &getDependencies() const { 220 return Dependencies; 221 } 222 223 tgtok::TokKind getCode() const { return CurCode; } 224 225 const std::string &getCurStrVal() const { 226 assert(tgtok::isStringValue(CurCode) && 227 "This token doesn't have a string value"); 228 return CurStrVal; 229 } 230 int64_t getCurIntVal() const { 231 assert(CurCode == tgtok::IntVal && "This token isn't an integer"); 232 return CurIntVal; 233 } 234 std::pair<int64_t, unsigned> getCurBinaryIntVal() const { 235 assert(CurCode == tgtok::BinaryIntVal && 236 "This token isn't a binary integer"); 237 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); 238 } 239 240 SMLoc getLoc() const; 241 SMRange getLocRange() const; 242 243 private: 244 /// LexToken - Read the next token and return its code. 245 tgtok::TokKind LexToken(bool FileOrLineStart = false); 246 247 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); 248 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); 249 250 int getNextChar(); 251 int peekNextChar(int Index) const; 252 void SkipBCPLComment(); 253 bool SkipCComment(); 254 tgtok::TokKind LexIdentifier(); 255 bool LexInclude(); 256 tgtok::TokKind LexString(); 257 tgtok::TokKind LexVarName(); 258 tgtok::TokKind LexNumber(); 259 tgtok::TokKind LexBracket(); 260 tgtok::TokKind LexExclaim(); 261 262 // Process EOF encountered in LexToken(). 263 // If EOF is met in an include file, then the method will update 264 // CurPtr, CurBuf and preprocessing include stack, and return true. 265 // If EOF is met in the top-level file, then the method will 266 // update and check the preprocessing include stack, and return false. 267 bool processEOF(); 268 269 // *** Structures and methods for preprocessing support *** 270 271 // A set of macro names that are defined either via command line or 272 // by using: 273 // #define NAME 274 StringSet<> DefinedMacros; 275 276 // Each of #ifdef and #else directives has a descriptor associated 277 // with it. 278 // 279 // An ordered list of preprocessing controls defined by #ifdef/#else 280 // directives that are in effect currently is called preprocessing 281 // control stack. It is represented as a vector of PreprocessorControlDesc's. 282 // 283 // The control stack is updated according to the following rules: 284 // 285 // For each #ifdef we add an element to the control stack. 286 // For each #else we replace the top element with a descriptor 287 // with an inverted IsDefined value. 288 // For each #endif we pop the top element from the control stack. 289 // 290 // When CurPtr reaches the current buffer's end, the control stack 291 // must be empty, i.e. #ifdef and the corresponding #endif 292 // must be located in the same file. 293 struct PreprocessorControlDesc { 294 // Either tgtok::Ifdef or tgtok::Else. 295 tgtok::TokKind Kind; 296 297 // True, if the condition for this directive is true, false - otherwise. 298 // Examples: 299 // #ifdef NAME : true, if NAME is defined, false - otherwise. 300 // ... 301 // #else : false, if NAME is defined, true - otherwise. 302 bool IsDefined; 303 304 // Pointer into CurBuf to the beginning of the preprocessing directive 305 // word, e.g.: 306 // #ifdef NAME 307 // ^ - SrcPos 308 SMLoc SrcPos; 309 }; 310 311 // We want to disallow code like this: 312 // file1.td: 313 // #define NAME 314 // #ifdef NAME 315 // include "file2.td" 316 // EOF 317 // file2.td: 318 // #endif 319 // EOF 320 // 321 // To do this, we clear the preprocessing control stack on entry 322 // to each of the included file. PrepIncludeStack is used to store 323 // preprocessing control stacks for the current file and all its 324 // parent files. The back() element is the preprocessing control 325 // stack for the current file. 326 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> 327 PrepIncludeStack; 328 329 // Validate that the current preprocessing control stack is empty, 330 // since we are about to exit a file, and pop the include stack. 331 // 332 // If IncludeStackMustBeEmpty is true, the include stack must be empty 333 // after the popping, otherwise, the include stack must not be empty 334 // after the popping. Basically, the include stack must be empty 335 // only if we exit the "top-level" file (i.e. finish lexing). 336 // 337 // The method returns false, if the current preprocessing control stack 338 // is not empty (e.g. there is an unterminated #ifdef/#else), 339 // true - otherwise. 340 bool prepExitInclude(bool IncludeStackMustBeEmpty); 341 342 // Look ahead for a preprocessing directive starting from CurPtr. The caller 343 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches 344 // a preprocessing directive word followed by a whitespace, then it returns 345 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. 346 // 347 // CurPtr is not adjusted by this method. 348 tgtok::TokKind prepIsDirective() const; 349 350 // Given a preprocessing token kind, adjusts CurPtr to the end 351 // of the preprocessing directive word. Returns true, unless 352 // an unsupported token kind is passed in. 353 // 354 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() 355 // to avoid adjusting CurPtr before we are sure that '#' is followed 356 // by a preprocessing directive. If it is not, then we fall back to 357 // tgtok::paste interpretation of '#'. 358 bool prepEatPreprocessorDirective(tgtok::TokKind Kind); 359 360 // The main "exit" point from the token parsing to preprocessor. 361 // 362 // The method is called for CurPtr, when prepIsDirective() returns 363 // true. The first parameter matches the result of prepIsDirective(), 364 // denoting the actual preprocessor directive to be processed. 365 // 366 // If the preprocessing directive disables the tokens processing, e.g.: 367 // #ifdef NAME // NAME is undefined 368 // then lexPreprocessor() enters the lines-skipping mode. 369 // In this mode, it does not parse any tokens, because the code under 370 // the #ifdef may not even be a correct tablegen code. The preprocessor 371 // looks for lines containing other preprocessing directives, which 372 // may be prepended with whitespaces and C-style comments. If the line 373 // does not contain a preprocessing directive, it is skipped completely. 374 // Otherwise, the preprocessing directive is processed by recursively 375 // calling lexPreprocessor(). The processing of the encountered 376 // preprocessing directives includes updating preprocessing control stack 377 // and adding new macros into DefinedMacros set. 378 // 379 // The second parameter controls whether lexPreprocessor() is called from 380 // LexToken() (true) or recursively from lexPreprocessor() (false). 381 // 382 // If ReturnNextLiveToken is true, the method returns the next 383 // LEX token following the current directive or following the end 384 // of the disabled preprocessing region corresponding to this directive. 385 // If ReturnNextLiveToken is false, the method returns the first parameter, 386 // unless there were errors encountered in the disabled preprocessing 387 // region - in this case, it returns tgtok::Error. 388 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, 389 bool ReturnNextLiveToken = true); 390 391 // Worker method for lexPreprocessor() to skip lines after some 392 // preprocessing directive up to the buffer end or to the directive 393 // that re-enables token processing. The method returns true 394 // upon processing the next directive that re-enables tokens 395 // processing. False is returned if an error was encountered. 396 // 397 // Note that prepSkipRegion() calls lexPreprocessor() to process 398 // encountered preprocessing directives. In this case, the second 399 // parameter to lexPreprocessor() is set to false. Being passed 400 // false ReturnNextLiveToken, lexPreprocessor() must never call 401 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken 402 // to prepSkipRegion() and checking that it is never set to false. 403 bool prepSkipRegion(bool MustNeverBeFalse); 404 405 // Lex name of the macro after either #ifdef or #define. We could have used 406 // LexIdentifier(), but it has special handling of "include" word, which 407 // could result in awkward diagnostic errors. Consider: 408 // ---- 409 // #ifdef include 410 // class ... 411 // ---- 412 // LexIdentifier() will engage LexInclude(), which will complain about 413 // missing file with name "class". Instead, prepLexMacroName() will treat 414 // "include" as a normal macro name. 415 // 416 // On entry, CurPtr points to the end of a preprocessing directive word. 417 // The method allows for whitespaces between the preprocessing directive 418 // and the macro name. The allowed whitespaces are ' ' and '\t'. 419 // 420 // If the first non-whitespace symbol after the preprocessing directive 421 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then 422 // the method updates TokStart to the position of the first non-whitespace 423 // symbol, sets CurPtr to the position of the macro name's last symbol, 424 // and returns a string reference to the macro name. Otherwise, 425 // TokStart is set to the first non-whitespace symbol after the preprocessing 426 // directive, and the method returns an empty string reference. 427 // 428 // In all cases, TokStart may be used to point to the word following 429 // the preprocessing directive. 430 StringRef prepLexMacroName(); 431 432 // Skip any whitespaces starting from CurPtr. The method is used 433 // only in the lines-skipping mode to find the first non-whitespace 434 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' 435 // and '\r'. The method skips C-style comments as well, because 436 // it is used to find the beginning of the preprocessing directive. 437 // If we do not handle C-style comments the following code would 438 // result in incorrect detection of a preprocessing directive: 439 // /* 440 // #ifdef NAME 441 // */ 442 // As long as we skip C-style comments, the following code is correctly 443 // recognized as a preprocessing directive: 444 // /* first line comment 445 // second line comment */ #ifdef NAME 446 // 447 // The method returns true upon reaching the first non-whitespace symbol 448 // or EOF, CurPtr is set to point to this symbol. The method returns false, 449 // if an error occurred during skipping of a C-style comment. 450 bool prepSkipLineBegin(); 451 452 // Skip any whitespaces or comments after a preprocessing directive. 453 // The method returns true upon reaching either end of the line 454 // or end of the file. If there is a multiline C-style comment 455 // after the preprocessing directive, the method skips 456 // the comment, so the final CurPtr may point to one of the next lines. 457 // The method returns false, if an error occurred during skipping 458 // C- or C++-style comment, or a non-whitespace symbol appears 459 // after the preprocessing directive. 460 // 461 // The method maybe called both during lines-skipping and tokens 462 // processing. It actually verifies that only whitespaces or/and 463 // comments follow a preprocessing directive. 464 // 465 // After the execution of this mehod, CurPtr points either to new line 466 // symbol, buffer end or non-whitespace symbol following the preprocesing 467 // directive. 468 bool prepSkipDirectiveEnd(); 469 470 // Return true, if the current preprocessor control stack is such that 471 // we should allow lexer to process the next token, false - otherwise. 472 // 473 // In particular, the method returns true, if all the #ifdef/#else 474 // controls on the stack have their IsDefined member set to true. 475 bool prepIsProcessingEnabled(); 476 477 // Report an error, if we reach EOF with non-empty preprocessing control 478 // stack. This means there is no matching #endif for the previous 479 // #ifdef/#else. 480 void prepReportPreprocessorStackError(); 481 }; 482 483 } // end namespace llvm 484 485 #endif 486