1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class represents the Lexer for tablegen files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H 14 #define LLVM_LIB_TABLEGEN_TGLEXER_H 15 16 #include "llvm/ADT/StringRef.h" 17 #include "llvm/ADT/StringSet.h" 18 #include "llvm/Support/DataTypes.h" 19 #include "llvm/Support/SMLoc.h" 20 #include <cassert> 21 #include <memory> 22 #include <set> 23 #include <string> 24 #include <vector> 25 26 namespace llvm { 27 template <typename T> class ArrayRef; 28 class SourceMgr; 29 class Twine; 30 31 namespace tgtok { 32 enum TokKind { 33 // Markers 34 Eof, 35 Error, 36 37 // Tokens with no info. 38 minus, // - 39 plus, // + 40 l_square, // [ 41 r_square, // ] 42 l_brace, // { 43 r_brace, // } 44 l_paren, // ( 45 r_paren, // ) 46 less, // < 47 greater, // > 48 colon, // : 49 semi, // ; 50 comma, // , 51 dot, // . 52 equal, // = 53 question, // ? 54 paste, // # 55 dotdotdot, // ... 56 57 // Reserved keywords. ('ElseKW' is named to distinguish it from the 58 // existing 'Else' that means the preprocessor #else.) 59 Assert, 60 Bit, 61 Bits, 62 Class, 63 Code, 64 Dag, 65 Def, 66 Defm, 67 Defset, 68 Defvar, 69 ElseKW, 70 FalseKW, 71 Field, 72 Foreach, 73 If, 74 In, 75 Include, 76 Int, 77 Let, 78 List, 79 MultiClass, 80 String, 81 Then, 82 TrueKW, 83 84 // Bang operators. 85 XConcat, 86 XADD, 87 XSUB, 88 XMUL, 89 XDIV, 90 XNOT, 91 XLOG2, 92 XAND, 93 XOR, 94 XXOR, 95 XSRA, 96 XSRL, 97 XSHL, 98 XListConcat, 99 XListSplat, 100 XStrConcat, 101 XInterleave, 102 XSubstr, 103 XFind, 104 XCast, 105 XSubst, 106 XForEach, 107 XFilter, 108 XFoldl, 109 XHead, 110 XTail, 111 XSize, 112 XEmpty, 113 XIf, 114 XCond, 115 XEq, 116 XIsA, 117 XDag, 118 XNe, 119 XLe, 120 XLt, 121 XGe, 122 XGt, 123 XSetDagOp, 124 XGetDagOp, 125 XExists, 126 XListRemove, 127 XToLower, 128 XToUpper, 129 XRange, 130 XGetDagArg, 131 XGetDagName, 132 XSetDagArg, 133 XSetDagName, 134 135 // Boolean literals. 136 TrueVal, 137 FalseVal, 138 139 // Integer value. 140 IntVal, 141 142 // Binary constant. Note that these are sized according to the number of 143 // bits given. 144 BinaryIntVal, 145 146 // String valued tokens. 147 Id, 148 StrVal, 149 VarName, 150 CodeFragment, 151 152 // Preprocessing tokens for internal usage by the lexer. 153 // They are never returned as a result of Lex(). 154 Ifdef, 155 Ifndef, 156 Else, 157 Endif, 158 Define 159 }; 160 } 161 162 /// TGLexer - TableGen Lexer class. 163 class TGLexer { 164 SourceMgr &SrcMgr; 165 166 const char *CurPtr = nullptr; 167 StringRef CurBuf; 168 169 // Information about the current token. 170 const char *TokStart = nullptr; 171 tgtok::TokKind CurCode = tgtok::TokKind::Eof; 172 std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment 173 int64_t CurIntVal = 0; // This is valid for IntVal. 174 175 /// CurBuffer - This is the current buffer index we're lexing from as managed 176 /// by the SourceMgr object. 177 unsigned CurBuffer = 0; 178 179 public: 180 typedef std::set<std::string> DependenciesSetTy; 181 182 private: 183 /// Dependencies - This is the list of all included files. 184 DependenciesSetTy Dependencies; 185 186 public: 187 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); 188 189 tgtok::TokKind Lex() { 190 return CurCode = LexToken(CurPtr == CurBuf.begin()); 191 } 192 193 const DependenciesSetTy &getDependencies() const { 194 return Dependencies; 195 } 196 197 tgtok::TokKind getCode() const { return CurCode; } 198 199 const std::string &getCurStrVal() const { 200 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal || 201 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) && 202 "This token doesn't have a string value"); 203 return CurStrVal; 204 } 205 int64_t getCurIntVal() const { 206 assert(CurCode == tgtok::IntVal && "This token isn't an integer"); 207 return CurIntVal; 208 } 209 std::pair<int64_t, unsigned> getCurBinaryIntVal() const { 210 assert(CurCode == tgtok::BinaryIntVal && 211 "This token isn't a binary integer"); 212 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); 213 } 214 215 SMLoc getLoc() const; 216 SMRange getLocRange() const; 217 218 private: 219 /// LexToken - Read the next token and return its code. 220 tgtok::TokKind LexToken(bool FileOrLineStart = false); 221 222 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); 223 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); 224 225 int getNextChar(); 226 int peekNextChar(int Index) const; 227 void SkipBCPLComment(); 228 bool SkipCComment(); 229 tgtok::TokKind LexIdentifier(); 230 bool LexInclude(); 231 tgtok::TokKind LexString(); 232 tgtok::TokKind LexVarName(); 233 tgtok::TokKind LexNumber(); 234 tgtok::TokKind LexBracket(); 235 tgtok::TokKind LexExclaim(); 236 237 // Process EOF encountered in LexToken(). 238 // If EOF is met in an include file, then the method will update 239 // CurPtr, CurBuf and preprocessing include stack, and return true. 240 // If EOF is met in the top-level file, then the method will 241 // update and check the preprocessing include stack, and return false. 242 bool processEOF(); 243 244 // *** Structures and methods for preprocessing support *** 245 246 // A set of macro names that are defined either via command line or 247 // by using: 248 // #define NAME 249 StringSet<> DefinedMacros; 250 251 // Each of #ifdef and #else directives has a descriptor associated 252 // with it. 253 // 254 // An ordered list of preprocessing controls defined by #ifdef/#else 255 // directives that are in effect currently is called preprocessing 256 // control stack. It is represented as a vector of PreprocessorControlDesc's. 257 // 258 // The control stack is updated according to the following rules: 259 // 260 // For each #ifdef we add an element to the control stack. 261 // For each #else we replace the top element with a descriptor 262 // with an inverted IsDefined value. 263 // For each #endif we pop the top element from the control stack. 264 // 265 // When CurPtr reaches the current buffer's end, the control stack 266 // must be empty, i.e. #ifdef and the corresponding #endif 267 // must be located in the same file. 268 struct PreprocessorControlDesc { 269 // Either tgtok::Ifdef or tgtok::Else. 270 tgtok::TokKind Kind; 271 272 // True, if the condition for this directive is true, false - otherwise. 273 // Examples: 274 // #ifdef NAME : true, if NAME is defined, false - otherwise. 275 // ... 276 // #else : false, if NAME is defined, true - otherwise. 277 bool IsDefined; 278 279 // Pointer into CurBuf to the beginning of the preprocessing directive 280 // word, e.g.: 281 // #ifdef NAME 282 // ^ - SrcPos 283 SMLoc SrcPos; 284 }; 285 286 // We want to disallow code like this: 287 // file1.td: 288 // #define NAME 289 // #ifdef NAME 290 // include "file2.td" 291 // EOF 292 // file2.td: 293 // #endif 294 // EOF 295 // 296 // To do this, we clear the preprocessing control stack on entry 297 // to each of the included file. PrepIncludeStack is used to store 298 // preprocessing control stacks for the current file and all its 299 // parent files. The back() element is the preprocessing control 300 // stack for the current file. 301 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> 302 PrepIncludeStack; 303 304 // Validate that the current preprocessing control stack is empty, 305 // since we are about to exit a file, and pop the include stack. 306 // 307 // If IncludeStackMustBeEmpty is true, the include stack must be empty 308 // after the popping, otherwise, the include stack must not be empty 309 // after the popping. Basically, the include stack must be empty 310 // only if we exit the "top-level" file (i.e. finish lexing). 311 // 312 // The method returns false, if the current preprocessing control stack 313 // is not empty (e.g. there is an unterminated #ifdef/#else), 314 // true - otherwise. 315 bool prepExitInclude(bool IncludeStackMustBeEmpty); 316 317 // Look ahead for a preprocessing directive starting from CurPtr. The caller 318 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches 319 // a preprocessing directive word followed by a whitespace, then it returns 320 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. 321 // 322 // CurPtr is not adjusted by this method. 323 tgtok::TokKind prepIsDirective() const; 324 325 // Given a preprocessing token kind, adjusts CurPtr to the end 326 // of the preprocessing directive word. Returns true, unless 327 // an unsupported token kind is passed in. 328 // 329 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() 330 // to avoid adjusting CurPtr before we are sure that '#' is followed 331 // by a preprocessing directive. If it is not, then we fall back to 332 // tgtok::paste interpretation of '#'. 333 bool prepEatPreprocessorDirective(tgtok::TokKind Kind); 334 335 // The main "exit" point from the token parsing to preprocessor. 336 // 337 // The method is called for CurPtr, when prepIsDirective() returns 338 // true. The first parameter matches the result of prepIsDirective(), 339 // denoting the actual preprocessor directive to be processed. 340 // 341 // If the preprocessing directive disables the tokens processing, e.g.: 342 // #ifdef NAME // NAME is undefined 343 // then lexPreprocessor() enters the lines-skipping mode. 344 // In this mode, it does not parse any tokens, because the code under 345 // the #ifdef may not even be a correct tablegen code. The preprocessor 346 // looks for lines containing other preprocessing directives, which 347 // may be prepended with whitespaces and C-style comments. If the line 348 // does not contain a preprocessing directive, it is skipped completely. 349 // Otherwise, the preprocessing directive is processed by recursively 350 // calling lexPreprocessor(). The processing of the encountered 351 // preprocessing directives includes updating preprocessing control stack 352 // and adding new macros into DefinedMacros set. 353 // 354 // The second parameter controls whether lexPreprocessor() is called from 355 // LexToken() (true) or recursively from lexPreprocessor() (false). 356 // 357 // If ReturnNextLiveToken is true, the method returns the next 358 // LEX token following the current directive or following the end 359 // of the disabled preprocessing region corresponding to this directive. 360 // If ReturnNextLiveToken is false, the method returns the first parameter, 361 // unless there were errors encountered in the disabled preprocessing 362 // region - in this case, it returns tgtok::Error. 363 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, 364 bool ReturnNextLiveToken = true); 365 366 // Worker method for lexPreprocessor() to skip lines after some 367 // preprocessing directive up to the buffer end or to the directive 368 // that re-enables token processing. The method returns true 369 // upon processing the next directive that re-enables tokens 370 // processing. False is returned if an error was encountered. 371 // 372 // Note that prepSkipRegion() calls lexPreprocessor() to process 373 // encountered preprocessing directives. In this case, the second 374 // parameter to lexPreprocessor() is set to false. Being passed 375 // false ReturnNextLiveToken, lexPreprocessor() must never call 376 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken 377 // to prepSkipRegion() and checking that it is never set to false. 378 bool prepSkipRegion(bool MustNeverBeFalse); 379 380 // Lex name of the macro after either #ifdef or #define. We could have used 381 // LexIdentifier(), but it has special handling of "include" word, which 382 // could result in awkward diagnostic errors. Consider: 383 // ---- 384 // #ifdef include 385 // class ... 386 // ---- 387 // LexIdentifier() will engage LexInclude(), which will complain about 388 // missing file with name "class". Instead, prepLexMacroName() will treat 389 // "include" as a normal macro name. 390 // 391 // On entry, CurPtr points to the end of a preprocessing directive word. 392 // The method allows for whitespaces between the preprocessing directive 393 // and the macro name. The allowed whitespaces are ' ' and '\t'. 394 // 395 // If the first non-whitespace symbol after the preprocessing directive 396 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then 397 // the method updates TokStart to the position of the first non-whitespace 398 // symbol, sets CurPtr to the position of the macro name's last symbol, 399 // and returns a string reference to the macro name. Otherwise, 400 // TokStart is set to the first non-whitespace symbol after the preprocessing 401 // directive, and the method returns an empty string reference. 402 // 403 // In all cases, TokStart may be used to point to the word following 404 // the preprocessing directive. 405 StringRef prepLexMacroName(); 406 407 // Skip any whitespaces starting from CurPtr. The method is used 408 // only in the lines-skipping mode to find the first non-whitespace 409 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' 410 // and '\r'. The method skips C-style comments as well, because 411 // it is used to find the beginning of the preprocessing directive. 412 // If we do not handle C-style comments the following code would 413 // result in incorrect detection of a preprocessing directive: 414 // /* 415 // #ifdef NAME 416 // */ 417 // As long as we skip C-style comments, the following code is correctly 418 // recognized as a preprocessing directive: 419 // /* first line comment 420 // second line comment */ #ifdef NAME 421 // 422 // The method returns true upon reaching the first non-whitespace symbol 423 // or EOF, CurPtr is set to point to this symbol. The method returns false, 424 // if an error occurred during skipping of a C-style comment. 425 bool prepSkipLineBegin(); 426 427 // Skip any whitespaces or comments after a preprocessing directive. 428 // The method returns true upon reaching either end of the line 429 // or end of the file. If there is a multiline C-style comment 430 // after the preprocessing directive, the method skips 431 // the comment, so the final CurPtr may point to one of the next lines. 432 // The method returns false, if an error occurred during skipping 433 // C- or C++-style comment, or a non-whitespace symbol appears 434 // after the preprocessing directive. 435 // 436 // The method maybe called both during lines-skipping and tokens 437 // processing. It actually verifies that only whitespaces or/and 438 // comments follow a preprocessing directive. 439 // 440 // After the execution of this mehod, CurPtr points either to new line 441 // symbol, buffer end or non-whitespace symbol following the preprocesing 442 // directive. 443 bool prepSkipDirectiveEnd(); 444 445 // Skip all symbols to the end of the line/file. 446 // The method adjusts CurPtr, so that it points to either new line 447 // symbol in the current line or the buffer end. 448 void prepSkipToLineEnd(); 449 450 // Return true, if the current preprocessor control stack is such that 451 // we should allow lexer to process the next token, false - otherwise. 452 // 453 // In particular, the method returns true, if all the #ifdef/#else 454 // controls on the stack have their IsDefined member set to true. 455 bool prepIsProcessingEnabled(); 456 457 // Report an error, if we reach EOF with non-empty preprocessing control 458 // stack. This means there is no matching #endif for the previous 459 // #ifdef/#else. 460 void prepReportPreprocessorStackError(); 461 }; 462 463 } // end namespace llvm 464 465 #endif 466