1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class represents the Lexer for tablegen files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H 14 #define LLVM_LIB_TABLEGEN_TGLEXER_H 15 16 #include "llvm/ADT/StringRef.h" 17 #include "llvm/ADT/StringSet.h" 18 #include "llvm/Support/DataTypes.h" 19 #include "llvm/Support/SMLoc.h" 20 #include <cassert> 21 #include <memory> 22 #include <set> 23 #include <string> 24 #include <vector> 25 26 namespace llvm { 27 template <typename T> class ArrayRef; 28 class SourceMgr; 29 class Twine; 30 31 namespace tgtok { 32 enum TokKind { 33 // Markers 34 Eof, Error, 35 36 // Tokens with no info. 37 minus, plus, // - + 38 l_square, r_square, // [ ] 39 l_brace, r_brace, // { } 40 l_paren, r_paren, // ( ) 41 less, greater, // < > 42 colon, semi, // : ; 43 comma, dot, // , . 44 equal, question, // = ? 45 paste, // # 46 dotdotdot, // ... 47 48 // Reserved keywords. ('ElseKW' is named to distinguish it from the 49 // existing 'Else' that means the preprocessor #else.) 50 Assert, Bit, Bits, Class, Code, Dag, Def, Defm, Defset, Defvar, ElseKW, 51 FalseKW, Field, Foreach, If, In, Include, Int, Let, List, MultiClass, 52 String, Then, TrueKW, 53 54 // Bang operators. 55 XConcat, XADD, XSUB, XMUL, XDIV, XNOT, XLOG2, XAND, XOR, XXOR, XSRA, XSRL, 56 XSHL, XListConcat, XListSplat, XStrConcat, XInterleave, XSubstr, XFind, 57 XCast, XSubst, XForEach, XFilter, XFoldl, XHead, XTail, XSize, XEmpty, XIf, 58 XCond, XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt, XSetDagOp, XGetDagOp, 59 XExists, XListRemove, 60 61 // Boolean literals. 62 TrueVal, FalseVal, 63 64 // Integer value. 65 IntVal, 66 67 // Binary constant. Note that these are sized according to the number of 68 // bits given. 69 BinaryIntVal, 70 71 // String valued tokens. 72 Id, StrVal, VarName, CodeFragment, 73 74 // Preprocessing tokens for internal usage by the lexer. 75 // They are never returned as a result of Lex(). 76 Ifdef, Ifndef, Else, Endif, Define 77 }; 78 } 79 80 /// TGLexer - TableGen Lexer class. 81 class TGLexer { 82 SourceMgr &SrcMgr; 83 84 const char *CurPtr = nullptr; 85 StringRef CurBuf; 86 87 // Information about the current token. 88 const char *TokStart = nullptr; 89 tgtok::TokKind CurCode = tgtok::TokKind::Eof; 90 std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment 91 int64_t CurIntVal = 0; // This is valid for IntVal. 92 93 /// CurBuffer - This is the current buffer index we're lexing from as managed 94 /// by the SourceMgr object. 95 unsigned CurBuffer = 0; 96 97 public: 98 typedef std::set<std::string> DependenciesSetTy; 99 100 private: 101 /// Dependencies - This is the list of all included files. 102 DependenciesSetTy Dependencies; 103 104 public: 105 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); 106 107 tgtok::TokKind Lex() { 108 return CurCode = LexToken(CurPtr == CurBuf.begin()); 109 } 110 111 const DependenciesSetTy &getDependencies() const { 112 return Dependencies; 113 } 114 115 tgtok::TokKind getCode() const { return CurCode; } 116 117 const std::string &getCurStrVal() const { 118 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal || 119 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) && 120 "This token doesn't have a string value"); 121 return CurStrVal; 122 } 123 int64_t getCurIntVal() const { 124 assert(CurCode == tgtok::IntVal && "This token isn't an integer"); 125 return CurIntVal; 126 } 127 std::pair<int64_t, unsigned> getCurBinaryIntVal() const { 128 assert(CurCode == tgtok::BinaryIntVal && 129 "This token isn't a binary integer"); 130 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); 131 } 132 133 SMLoc getLoc() const; 134 SMRange getLocRange() const; 135 136 private: 137 /// LexToken - Read the next token and return its code. 138 tgtok::TokKind LexToken(bool FileOrLineStart = false); 139 140 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); 141 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); 142 143 int getNextChar(); 144 int peekNextChar(int Index) const; 145 void SkipBCPLComment(); 146 bool SkipCComment(); 147 tgtok::TokKind LexIdentifier(); 148 bool LexInclude(); 149 tgtok::TokKind LexString(); 150 tgtok::TokKind LexVarName(); 151 tgtok::TokKind LexNumber(); 152 tgtok::TokKind LexBracket(); 153 tgtok::TokKind LexExclaim(); 154 155 // Process EOF encountered in LexToken(). 156 // If EOF is met in an include file, then the method will update 157 // CurPtr, CurBuf and preprocessing include stack, and return true. 158 // If EOF is met in the top-level file, then the method will 159 // update and check the preprocessing include stack, and return false. 160 bool processEOF(); 161 162 // *** Structures and methods for preprocessing support *** 163 164 // A set of macro names that are defined either via command line or 165 // by using: 166 // #define NAME 167 StringSet<> DefinedMacros; 168 169 // Each of #ifdef and #else directives has a descriptor associated 170 // with it. 171 // 172 // An ordered list of preprocessing controls defined by #ifdef/#else 173 // directives that are in effect currently is called preprocessing 174 // control stack. It is represented as a vector of PreprocessorControlDesc's. 175 // 176 // The control stack is updated according to the following rules: 177 // 178 // For each #ifdef we add an element to the control stack. 179 // For each #else we replace the top element with a descriptor 180 // with an inverted IsDefined value. 181 // For each #endif we pop the top element from the control stack. 182 // 183 // When CurPtr reaches the current buffer's end, the control stack 184 // must be empty, i.e. #ifdef and the corresponding #endif 185 // must be located in the same file. 186 struct PreprocessorControlDesc { 187 // Either tgtok::Ifdef or tgtok::Else. 188 tgtok::TokKind Kind; 189 190 // True, if the condition for this directive is true, false - otherwise. 191 // Examples: 192 // #ifdef NAME : true, if NAME is defined, false - otherwise. 193 // ... 194 // #else : false, if NAME is defined, true - otherwise. 195 bool IsDefined; 196 197 // Pointer into CurBuf to the beginning of the preprocessing directive 198 // word, e.g.: 199 // #ifdef NAME 200 // ^ - SrcPos 201 SMLoc SrcPos; 202 }; 203 204 // We want to disallow code like this: 205 // file1.td: 206 // #define NAME 207 // #ifdef NAME 208 // include "file2.td" 209 // EOF 210 // file2.td: 211 // #endif 212 // EOF 213 // 214 // To do this, we clear the preprocessing control stack on entry 215 // to each of the included file. PrepIncludeStack is used to store 216 // preprocessing control stacks for the current file and all its 217 // parent files. The back() element is the preprocessing control 218 // stack for the current file. 219 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> 220 PrepIncludeStack; 221 222 // Validate that the current preprocessing control stack is empty, 223 // since we are about to exit a file, and pop the include stack. 224 // 225 // If IncludeStackMustBeEmpty is true, the include stack must be empty 226 // after the popping, otherwise, the include stack must not be empty 227 // after the popping. Basically, the include stack must be empty 228 // only if we exit the "top-level" file (i.e. finish lexing). 229 // 230 // The method returns false, if the current preprocessing control stack 231 // is not empty (e.g. there is an unterminated #ifdef/#else), 232 // true - otherwise. 233 bool prepExitInclude(bool IncludeStackMustBeEmpty); 234 235 // Look ahead for a preprocessing directive starting from CurPtr. The caller 236 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches 237 // a preprocessing directive word followed by a whitespace, then it returns 238 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. 239 // 240 // CurPtr is not adjusted by this method. 241 tgtok::TokKind prepIsDirective() const; 242 243 // Given a preprocessing token kind, adjusts CurPtr to the end 244 // of the preprocessing directive word. Returns true, unless 245 // an unsupported token kind is passed in. 246 // 247 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() 248 // to avoid adjusting CurPtr before we are sure that '#' is followed 249 // by a preprocessing directive. If it is not, then we fall back to 250 // tgtok::paste interpretation of '#'. 251 bool prepEatPreprocessorDirective(tgtok::TokKind Kind); 252 253 // The main "exit" point from the token parsing to preprocessor. 254 // 255 // The method is called for CurPtr, when prepIsDirective() returns 256 // true. The first parameter matches the result of prepIsDirective(), 257 // denoting the actual preprocessor directive to be processed. 258 // 259 // If the preprocessing directive disables the tokens processing, e.g.: 260 // #ifdef NAME // NAME is undefined 261 // then lexPreprocessor() enters the lines-skipping mode. 262 // In this mode, it does not parse any tokens, because the code under 263 // the #ifdef may not even be a correct tablegen code. The preprocessor 264 // looks for lines containing other preprocessing directives, which 265 // may be prepended with whitespaces and C-style comments. If the line 266 // does not contain a preprocessing directive, it is skipped completely. 267 // Otherwise, the preprocessing directive is processed by recursively 268 // calling lexPreprocessor(). The processing of the encountered 269 // preprocessing directives includes updating preprocessing control stack 270 // and adding new macros into DefinedMacros set. 271 // 272 // The second parameter controls whether lexPreprocessor() is called from 273 // LexToken() (true) or recursively from lexPreprocessor() (false). 274 // 275 // If ReturnNextLiveToken is true, the method returns the next 276 // LEX token following the current directive or following the end 277 // of the disabled preprocessing region corresponding to this directive. 278 // If ReturnNextLiveToken is false, the method returns the first parameter, 279 // unless there were errors encountered in the disabled preprocessing 280 // region - in this case, it returns tgtok::Error. 281 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, 282 bool ReturnNextLiveToken = true); 283 284 // Worker method for lexPreprocessor() to skip lines after some 285 // preprocessing directive up to the buffer end or to the directive 286 // that re-enables token processing. The method returns true 287 // upon processing the next directive that re-enables tokens 288 // processing. False is returned if an error was encountered. 289 // 290 // Note that prepSkipRegion() calls lexPreprocessor() to process 291 // encountered preprocessing directives. In this case, the second 292 // parameter to lexPreprocessor() is set to false. Being passed 293 // false ReturnNextLiveToken, lexPreprocessor() must never call 294 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken 295 // to prepSkipRegion() and checking that it is never set to false. 296 bool prepSkipRegion(bool MustNeverBeFalse); 297 298 // Lex name of the macro after either #ifdef or #define. We could have used 299 // LexIdentifier(), but it has special handling of "include" word, which 300 // could result in awkward diagnostic errors. Consider: 301 // ---- 302 // #ifdef include 303 // class ... 304 // ---- 305 // LexIdentifier() will engage LexInclude(), which will complain about 306 // missing file with name "class". Instead, prepLexMacroName() will treat 307 // "include" as a normal macro name. 308 // 309 // On entry, CurPtr points to the end of a preprocessing directive word. 310 // The method allows for whitespaces between the preprocessing directive 311 // and the macro name. The allowed whitespaces are ' ' and '\t'. 312 // 313 // If the first non-whitespace symbol after the preprocessing directive 314 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then 315 // the method updates TokStart to the position of the first non-whitespace 316 // symbol, sets CurPtr to the position of the macro name's last symbol, 317 // and returns a string reference to the macro name. Otherwise, 318 // TokStart is set to the first non-whitespace symbol after the preprocessing 319 // directive, and the method returns an empty string reference. 320 // 321 // In all cases, TokStart may be used to point to the word following 322 // the preprocessing directive. 323 StringRef prepLexMacroName(); 324 325 // Skip any whitespaces starting from CurPtr. The method is used 326 // only in the lines-skipping mode to find the first non-whitespace 327 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' 328 // and '\r'. The method skips C-style comments as well, because 329 // it is used to find the beginning of the preprocessing directive. 330 // If we do not handle C-style comments the following code would 331 // result in incorrect detection of a preprocessing directive: 332 // /* 333 // #ifdef NAME 334 // */ 335 // As long as we skip C-style comments, the following code is correctly 336 // recognized as a preprocessing directive: 337 // /* first line comment 338 // second line comment */ #ifdef NAME 339 // 340 // The method returns true upon reaching the first non-whitespace symbol 341 // or EOF, CurPtr is set to point to this symbol. The method returns false, 342 // if an error occurred during skipping of a C-style comment. 343 bool prepSkipLineBegin(); 344 345 // Skip any whitespaces or comments after a preprocessing directive. 346 // The method returns true upon reaching either end of the line 347 // or end of the file. If there is a multiline C-style comment 348 // after the preprocessing directive, the method skips 349 // the comment, so the final CurPtr may point to one of the next lines. 350 // The method returns false, if an error occurred during skipping 351 // C- or C++-style comment, or a non-whitespace symbol appears 352 // after the preprocessing directive. 353 // 354 // The method maybe called both during lines-skipping and tokens 355 // processing. It actually verifies that only whitespaces or/and 356 // comments follow a preprocessing directive. 357 // 358 // After the execution of this mehod, CurPtr points either to new line 359 // symbol, buffer end or non-whitespace symbol following the preprocesing 360 // directive. 361 bool prepSkipDirectiveEnd(); 362 363 // Skip all symbols to the end of the line/file. 364 // The method adjusts CurPtr, so that it points to either new line 365 // symbol in the current line or the buffer end. 366 void prepSkipToLineEnd(); 367 368 // Return true, if the current preprocessor control stack is such that 369 // we should allow lexer to process the next token, false - otherwise. 370 // 371 // In particular, the method returns true, if all the #ifdef/#else 372 // controls on the stack have their IsDefined member set to true. 373 bool prepIsProcessingEnabled(); 374 375 // Report an error, if we reach EOF with non-empty preprocessing control 376 // stack. This means there is no matching #endif for the previous 377 // #ifdef/#else. 378 void prepReportPreprocessorStackError(); 379 }; 380 381 } // end namespace llvm 382 383 #endif 384