1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class represents the Lexer for tablegen files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H 14 #define LLVM_LIB_TABLEGEN_TGLEXER_H 15 16 #include "llvm/ADT/StringRef.h" 17 #include "llvm/ADT/StringSet.h" 18 #include "llvm/Support/DataTypes.h" 19 #include "llvm/Support/SMLoc.h" 20 #include <cassert> 21 #include <memory> 22 #include <set> 23 #include <string> 24 #include <vector> 25 26 namespace llvm { 27 template <typename T> class ArrayRef; 28 class SourceMgr; 29 class Twine; 30 31 namespace tgtok { 32 enum TokKind { 33 // Markers 34 Eof, Error, 35 36 // Tokens with no info. 37 minus, plus, // - + 38 l_square, r_square, // [ ] 39 l_brace, r_brace, // { } 40 l_paren, r_paren, // ( ) 41 less, greater, // < > 42 colon, semi, // : ; 43 comma, dot, // , . 44 equal, question, // = ? 45 paste, // # 46 dotdotdot, // ... 47 48 // Reserved keywords. ('ElseKW' is named to distinguish it from the 49 // existing 'Else' that means the preprocessor #else.) 50 Assert, Bit, Bits, Class, Code, Dag, Def, Defm, Defset, Defvar, ElseKW, 51 FalseKW, Field, Foreach, If, In, Include, Int, Let, List, MultiClass, 52 String, Then, TrueKW, 53 54 // Bang operators. 55 XConcat, XADD, XSUB, XMUL, XNOT, XAND, XOR, XXOR, XSRA, XSRL, XSHL, 56 XListConcat, XListSplat, XStrConcat, XInterleave, XSubstr, XCast, 57 XSubst, XForEach, XFilter, XFoldl, XHead, XTail, XSize, XEmpty, XIf, 58 XCond, XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt, XSetDagOp, XGetDagOp, 59 60 // Boolean literals. 61 TrueVal, FalseVal, 62 63 // Integer value. 64 IntVal, 65 66 // Binary constant. Note that these are sized according to the number of 67 // bits given. 68 BinaryIntVal, 69 70 // String valued tokens. 71 Id, StrVal, VarName, CodeFragment, 72 73 // Preprocessing tokens for internal usage by the lexer. 74 // They are never returned as a result of Lex(). 75 Ifdef, Ifndef, Else, Endif, Define 76 }; 77 } 78 79 /// TGLexer - TableGen Lexer class. 80 class TGLexer { 81 SourceMgr &SrcMgr; 82 83 const char *CurPtr = nullptr; 84 StringRef CurBuf; 85 86 // Information about the current token. 87 const char *TokStart = nullptr; 88 tgtok::TokKind CurCode = tgtok::TokKind::Eof; 89 std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment 90 int64_t CurIntVal = 0; // This is valid for IntVal. 91 92 /// CurBuffer - This is the current buffer index we're lexing from as managed 93 /// by the SourceMgr object. 94 unsigned CurBuffer = 0; 95 96 public: 97 typedef std::set<std::string> DependenciesSetTy; 98 99 private: 100 /// Dependencies - This is the list of all included files. 101 DependenciesSetTy Dependencies; 102 103 public: 104 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); 105 106 tgtok::TokKind Lex() { 107 return CurCode = LexToken(CurPtr == CurBuf.begin()); 108 } 109 110 const DependenciesSetTy &getDependencies() const { 111 return Dependencies; 112 } 113 114 tgtok::TokKind getCode() const { return CurCode; } 115 116 const std::string &getCurStrVal() const { 117 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal || 118 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) && 119 "This token doesn't have a string value"); 120 return CurStrVal; 121 } 122 int64_t getCurIntVal() const { 123 assert(CurCode == tgtok::IntVal && "This token isn't an integer"); 124 return CurIntVal; 125 } 126 std::pair<int64_t, unsigned> getCurBinaryIntVal() const { 127 assert(CurCode == tgtok::BinaryIntVal && 128 "This token isn't a binary integer"); 129 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); 130 } 131 132 SMLoc getLoc() const; 133 134 private: 135 /// LexToken - Read the next token and return its code. 136 tgtok::TokKind LexToken(bool FileOrLineStart = false); 137 138 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); 139 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); 140 141 int getNextChar(); 142 int peekNextChar(int Index) const; 143 void SkipBCPLComment(); 144 bool SkipCComment(); 145 tgtok::TokKind LexIdentifier(); 146 bool LexInclude(); 147 tgtok::TokKind LexString(); 148 tgtok::TokKind LexVarName(); 149 tgtok::TokKind LexNumber(); 150 tgtok::TokKind LexBracket(); 151 tgtok::TokKind LexExclaim(); 152 153 // Process EOF encountered in LexToken(). 154 // If EOF is met in an include file, then the method will update 155 // CurPtr, CurBuf and preprocessing include stack, and return true. 156 // If EOF is met in the top-level file, then the method will 157 // update and check the preprocessing include stack, and return false. 158 bool processEOF(); 159 160 // *** Structures and methods for preprocessing support *** 161 162 // A set of macro names that are defined either via command line or 163 // by using: 164 // #define NAME 165 StringSet<> DefinedMacros; 166 167 // Each of #ifdef and #else directives has a descriptor associated 168 // with it. 169 // 170 // An ordered list of preprocessing controls defined by #ifdef/#else 171 // directives that are in effect currently is called preprocessing 172 // control stack. It is represented as a vector of PreprocessorControlDesc's. 173 // 174 // The control stack is updated according to the following rules: 175 // 176 // For each #ifdef we add an element to the control stack. 177 // For each #else we replace the top element with a descriptor 178 // with an inverted IsDefined value. 179 // For each #endif we pop the top element from the control stack. 180 // 181 // When CurPtr reaches the current buffer's end, the control stack 182 // must be empty, i.e. #ifdef and the corresponding #endif 183 // must be located in the same file. 184 struct PreprocessorControlDesc { 185 // Either tgtok::Ifdef or tgtok::Else. 186 tgtok::TokKind Kind; 187 188 // True, if the condition for this directive is true, false - otherwise. 189 // Examples: 190 // #ifdef NAME : true, if NAME is defined, false - otherwise. 191 // ... 192 // #else : false, if NAME is defined, true - otherwise. 193 bool IsDefined; 194 195 // Pointer into CurBuf to the beginning of the preprocessing directive 196 // word, e.g.: 197 // #ifdef NAME 198 // ^ - SrcPos 199 SMLoc SrcPos; 200 }; 201 202 // We want to disallow code like this: 203 // file1.td: 204 // #define NAME 205 // #ifdef NAME 206 // include "file2.td" 207 // EOF 208 // file2.td: 209 // #endif 210 // EOF 211 // 212 // To do this, we clear the preprocessing control stack on entry 213 // to each of the included file. PrepIncludeStack is used to store 214 // preprocessing control stacks for the current file and all its 215 // parent files. The back() element is the preprocessing control 216 // stack for the current file. 217 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> 218 PrepIncludeStack; 219 220 // Validate that the current preprocessing control stack is empty, 221 // since we are about to exit a file, and pop the include stack. 222 // 223 // If IncludeStackMustBeEmpty is true, the include stack must be empty 224 // after the popping, otherwise, the include stack must not be empty 225 // after the popping. Basically, the include stack must be empty 226 // only if we exit the "top-level" file (i.e. finish lexing). 227 // 228 // The method returns false, if the current preprocessing control stack 229 // is not empty (e.g. there is an unterminated #ifdef/#else), 230 // true - otherwise. 231 bool prepExitInclude(bool IncludeStackMustBeEmpty); 232 233 // Look ahead for a preprocessing directive starting from CurPtr. The caller 234 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches 235 // a preprocessing directive word followed by a whitespace, then it returns 236 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. 237 // 238 // CurPtr is not adjusted by this method. 239 tgtok::TokKind prepIsDirective() const; 240 241 // Given a preprocessing token kind, adjusts CurPtr to the end 242 // of the preprocessing directive word. Returns true, unless 243 // an unsupported token kind is passed in. 244 // 245 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() 246 // to avoid adjusting CurPtr before we are sure that '#' is followed 247 // by a preprocessing directive. If it is not, then we fall back to 248 // tgtok::paste interpretation of '#'. 249 bool prepEatPreprocessorDirective(tgtok::TokKind Kind); 250 251 // The main "exit" point from the token parsing to preprocessor. 252 // 253 // The method is called for CurPtr, when prepIsDirective() returns 254 // true. The first parameter matches the result of prepIsDirective(), 255 // denoting the actual preprocessor directive to be processed. 256 // 257 // If the preprocessing directive disables the tokens processing, e.g.: 258 // #ifdef NAME // NAME is undefined 259 // then lexPreprocessor() enters the lines-skipping mode. 260 // In this mode, it does not parse any tokens, because the code under 261 // the #ifdef may not even be a correct tablegen code. The preprocessor 262 // looks for lines containing other preprocessing directives, which 263 // may be prepended with whitespaces and C-style comments. If the line 264 // does not contain a preprocessing directive, it is skipped completely. 265 // Otherwise, the preprocessing directive is processed by recursively 266 // calling lexPreprocessor(). The processing of the encountered 267 // preprocessing directives includes updating preprocessing control stack 268 // and adding new macros into DefinedMacros set. 269 // 270 // The second parameter controls whether lexPreprocessor() is called from 271 // LexToken() (true) or recursively from lexPreprocessor() (false). 272 // 273 // If ReturnNextLiveToken is true, the method returns the next 274 // LEX token following the current directive or following the end 275 // of the disabled preprocessing region corresponding to this directive. 276 // If ReturnNextLiveToken is false, the method returns the first parameter, 277 // unless there were errors encountered in the disabled preprocessing 278 // region - in this case, it returns tgtok::Error. 279 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, 280 bool ReturnNextLiveToken = true); 281 282 // Worker method for lexPreprocessor() to skip lines after some 283 // preprocessing directive up to the buffer end or to the directive 284 // that re-enables token processing. The method returns true 285 // upon processing the next directive that re-enables tokens 286 // processing. False is returned if an error was encountered. 287 // 288 // Note that prepSkipRegion() calls lexPreprocessor() to process 289 // encountered preprocessing directives. In this case, the second 290 // parameter to lexPreprocessor() is set to false. Being passed 291 // false ReturnNextLiveToken, lexPreprocessor() must never call 292 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken 293 // to prepSkipRegion() and checking that it is never set to false. 294 bool prepSkipRegion(bool MustNeverBeFalse); 295 296 // Lex name of the macro after either #ifdef or #define. We could have used 297 // LexIdentifier(), but it has special handling of "include" word, which 298 // could result in awkward diagnostic errors. Consider: 299 // ---- 300 // #ifdef include 301 // class ... 302 // ---- 303 // LexIdentifier() will engage LexInclude(), which will complain about 304 // missing file with name "class". Instead, prepLexMacroName() will treat 305 // "include" as a normal macro name. 306 // 307 // On entry, CurPtr points to the end of a preprocessing directive word. 308 // The method allows for whitespaces between the preprocessing directive 309 // and the macro name. The allowed whitespaces are ' ' and '\t'. 310 // 311 // If the first non-whitespace symbol after the preprocessing directive 312 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then 313 // the method updates TokStart to the position of the first non-whitespace 314 // symbol, sets CurPtr to the position of the macro name's last symbol, 315 // and returns a string reference to the macro name. Otherwise, 316 // TokStart is set to the first non-whitespace symbol after the preprocessing 317 // directive, and the method returns an empty string reference. 318 // 319 // In all cases, TokStart may be used to point to the word following 320 // the preprocessing directive. 321 StringRef prepLexMacroName(); 322 323 // Skip any whitespaces starting from CurPtr. The method is used 324 // only in the lines-skipping mode to find the first non-whitespace 325 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' 326 // and '\r'. The method skips C-style comments as well, because 327 // it is used to find the beginning of the preprocessing directive. 328 // If we do not handle C-style comments the following code would 329 // result in incorrect detection of a preprocessing directive: 330 // /* 331 // #ifdef NAME 332 // */ 333 // As long as we skip C-style comments, the following code is correctly 334 // recognized as a preprocessing directive: 335 // /* first line comment 336 // second line comment */ #ifdef NAME 337 // 338 // The method returns true upon reaching the first non-whitespace symbol 339 // or EOF, CurPtr is set to point to this symbol. The method returns false, 340 // if an error occured during skipping of a C-style comment. 341 bool prepSkipLineBegin(); 342 343 // Skip any whitespaces or comments after a preprocessing directive. 344 // The method returns true upon reaching either end of the line 345 // or end of the file. If there is a multiline C-style comment 346 // after the preprocessing directive, the method skips 347 // the comment, so the final CurPtr may point to one of the next lines. 348 // The method returns false, if an error occured during skipping 349 // C- or C++-style comment, or a non-whitespace symbol appears 350 // after the preprocessing directive. 351 // 352 // The method maybe called both during lines-skipping and tokens 353 // processing. It actually verifies that only whitespaces or/and 354 // comments follow a preprocessing directive. 355 // 356 // After the execution of this mehod, CurPtr points either to new line 357 // symbol, buffer end or non-whitespace symbol following the preprocesing 358 // directive. 359 bool prepSkipDirectiveEnd(); 360 361 // Skip all symbols to the end of the line/file. 362 // The method adjusts CurPtr, so that it points to either new line 363 // symbol in the current line or the buffer end. 364 void prepSkipToLineEnd(); 365 366 // Return true, if the current preprocessor control stack is such that 367 // we should allow lexer to process the next token, false - otherwise. 368 // 369 // In particular, the method returns true, if all the #ifdef/#else 370 // controls on the stack have their IsDefined member set to true. 371 bool prepIsProcessingEnabled(); 372 373 // Report an error, if we reach EOF with non-empty preprocessing control 374 // stack. This means there is no matching #endif for the previous 375 // #ifdef/#else. 376 void prepReportPreprocessorStackError(); 377 }; 378 379 } // end namespace llvm 380 381 #endif 382