1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class represents the Lexer for tablegen files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H 14 #define LLVM_LIB_TABLEGEN_TGLEXER_H 15 16 #include "llvm/ADT/StringRef.h" 17 #include "llvm/ADT/StringSet.h" 18 #include "llvm/Support/DataTypes.h" 19 #include "llvm/Support/SMLoc.h" 20 #include <cassert> 21 #include <memory> 22 #include <set> 23 #include <string> 24 #include <vector> 25 26 namespace llvm { 27 template <typename T> class ArrayRef; 28 class SourceMgr; 29 class Twine; 30 31 namespace tgtok { 32 enum TokKind { 33 // Markers 34 Eof, Error, 35 36 // Tokens with no info. 37 minus, plus, // - + 38 l_square, r_square, // [ ] 39 l_brace, r_brace, // { } 40 l_paren, r_paren, // ( ) 41 less, greater, // < > 42 colon, semi, // : ; 43 comma, period, // , . 44 equal, question, // = ? 45 paste, // # 46 47 // Keywords. ('ElseKW' is named to distinguish it from the existing 'Else' 48 // that means the preprocessor #else.) 49 Bit, Bits, Class, Code, Dag, Def, Foreach, Defm, Field, In, Int, Let, List, 50 MultiClass, String, Defset, Defvar, If, Then, ElseKW, 51 52 // !keywords. 53 XConcat, XADD, XMUL, XAND, XOR, XSRA, XSRL, XSHL, XListConcat, XListSplat, 54 XStrConcat, XCast, XSubst, XForEach, XFoldl, XHead, XTail, XSize, XEmpty, 55 XIf, XCond, XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt, XSetOp, XGetOp, 56 57 // Integer value. 58 IntVal, 59 60 // Binary constant. Note that these are sized according to the number of 61 // bits given. 62 BinaryIntVal, 63 64 // String valued tokens. 65 Id, StrVal, VarName, CodeFragment, 66 67 // Preprocessing tokens for internal usage by the lexer. 68 // They are never returned as a result of Lex(). 69 Ifdef, Ifndef, Else, Endif, Define 70 }; 71 } 72 73 /// TGLexer - TableGen Lexer class. 74 class TGLexer { 75 SourceMgr &SrcMgr; 76 77 const char *CurPtr = nullptr; 78 StringRef CurBuf; 79 80 // Information about the current token. 81 const char *TokStart = nullptr; 82 tgtok::TokKind CurCode = tgtok::TokKind::Eof; 83 std::string CurStrVal; // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT 84 int64_t CurIntVal = 0; // This is valid for INTVAL. 85 86 /// CurBuffer - This is the current buffer index we're lexing from as managed 87 /// by the SourceMgr object. 88 unsigned CurBuffer = 0; 89 90 public: 91 typedef std::set<std::string> DependenciesSetTy; 92 93 private: 94 /// Dependencies - This is the list of all included files. 95 DependenciesSetTy Dependencies; 96 97 public: 98 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); 99 100 tgtok::TokKind Lex() { 101 return CurCode = LexToken(CurPtr == CurBuf.begin()); 102 } 103 104 const DependenciesSetTy &getDependencies() const { 105 return Dependencies; 106 } 107 108 tgtok::TokKind getCode() const { return CurCode; } 109 110 const std::string &getCurStrVal() const { 111 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal || 112 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) && 113 "This token doesn't have a string value"); 114 return CurStrVal; 115 } 116 int64_t getCurIntVal() const { 117 assert(CurCode == tgtok::IntVal && "This token isn't an integer"); 118 return CurIntVal; 119 } 120 std::pair<int64_t, unsigned> getCurBinaryIntVal() const { 121 assert(CurCode == tgtok::BinaryIntVal && 122 "This token isn't a binary integer"); 123 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); 124 } 125 126 SMLoc getLoc() const; 127 128 private: 129 /// LexToken - Read the next token and return its code. 130 tgtok::TokKind LexToken(bool FileOrLineStart = false); 131 132 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); 133 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); 134 135 int getNextChar(); 136 int peekNextChar(int Index) const; 137 void SkipBCPLComment(); 138 bool SkipCComment(); 139 tgtok::TokKind LexIdentifier(); 140 bool LexInclude(); 141 tgtok::TokKind LexString(); 142 tgtok::TokKind LexVarName(); 143 tgtok::TokKind LexNumber(); 144 tgtok::TokKind LexBracket(); 145 tgtok::TokKind LexExclaim(); 146 147 // Process EOF encountered in LexToken(). 148 // If EOF is met in an include file, then the method will update 149 // CurPtr, CurBuf and preprocessing include stack, and return true. 150 // If EOF is met in the top-level file, then the method will 151 // update and check the preprocessing include stack, and return false. 152 bool processEOF(); 153 154 // *** Structures and methods for preprocessing support *** 155 156 // A set of macro names that are defined either via command line or 157 // by using: 158 // #define NAME 159 StringSet<> DefinedMacros; 160 161 // Each of #ifdef and #else directives has a descriptor associated 162 // with it. 163 // 164 // An ordered list of preprocessing controls defined by #ifdef/#else 165 // directives that are in effect currently is called preprocessing 166 // control stack. It is represented as a vector of PreprocessorControlDesc's. 167 // 168 // The control stack is updated according to the following rules: 169 // 170 // For each #ifdef we add an element to the control stack. 171 // For each #else we replace the top element with a descriptor 172 // with an inverted IsDefined value. 173 // For each #endif we pop the top element from the control stack. 174 // 175 // When CurPtr reaches the current buffer's end, the control stack 176 // must be empty, i.e. #ifdef and the corresponding #endif 177 // must be located in the same file. 178 struct PreprocessorControlDesc { 179 // Either tgtok::Ifdef or tgtok::Else. 180 tgtok::TokKind Kind; 181 182 // True, if the condition for this directive is true, false - otherwise. 183 // Examples: 184 // #ifdef NAME : true, if NAME is defined, false - otherwise. 185 // ... 186 // #else : false, if NAME is defined, true - otherwise. 187 bool IsDefined; 188 189 // Pointer into CurBuf to the beginning of the preprocessing directive 190 // word, e.g.: 191 // #ifdef NAME 192 // ^ - SrcPos 193 SMLoc SrcPos; 194 }; 195 196 // We want to disallow code like this: 197 // file1.td: 198 // #define NAME 199 // #ifdef NAME 200 // include "file2.td" 201 // EOF 202 // file2.td: 203 // #endif 204 // EOF 205 // 206 // To do this, we clear the preprocessing control stack on entry 207 // to each of the included file. PrepIncludeStack is used to store 208 // preprocessing control stacks for the current file and all its 209 // parent files. The back() element is the preprocessing control 210 // stack for the current file. 211 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> 212 PrepIncludeStack; 213 214 // Validate that the current preprocessing control stack is empty, 215 // since we are about to exit a file, and pop the include stack. 216 // 217 // If IncludeStackMustBeEmpty is true, the include stack must be empty 218 // after the popping, otherwise, the include stack must not be empty 219 // after the popping. Basically, the include stack must be empty 220 // only if we exit the "top-level" file (i.e. finish lexing). 221 // 222 // The method returns false, if the current preprocessing control stack 223 // is not empty (e.g. there is an unterminated #ifdef/#else), 224 // true - otherwise. 225 bool prepExitInclude(bool IncludeStackMustBeEmpty); 226 227 // Look ahead for a preprocessing directive starting from CurPtr. The caller 228 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches 229 // a preprocessing directive word followed by a whitespace, then it returns 230 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. 231 // 232 // CurPtr is not adjusted by this method. 233 tgtok::TokKind prepIsDirective() const; 234 235 // Given a preprocessing token kind, adjusts CurPtr to the end 236 // of the preprocessing directive word. Returns true, unless 237 // an unsupported token kind is passed in. 238 // 239 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() 240 // to avoid adjusting CurPtr before we are sure that '#' is followed 241 // by a preprocessing directive. If it is not, then we fall back to 242 // tgtok::paste interpretation of '#'. 243 bool prepEatPreprocessorDirective(tgtok::TokKind Kind); 244 245 // The main "exit" point from the token parsing to preprocessor. 246 // 247 // The method is called for CurPtr, when prepIsDirective() returns 248 // true. The first parameter matches the result of prepIsDirective(), 249 // denoting the actual preprocessor directive to be processed. 250 // 251 // If the preprocessing directive disables the tokens processing, e.g.: 252 // #ifdef NAME // NAME is undefined 253 // then lexPreprocessor() enters the lines-skipping mode. 254 // In this mode, it does not parse any tokens, because the code under 255 // the #ifdef may not even be a correct tablegen code. The preprocessor 256 // looks for lines containing other preprocessing directives, which 257 // may be prepended with whitespaces and C-style comments. If the line 258 // does not contain a preprocessing directive, it is skipped completely. 259 // Otherwise, the preprocessing directive is processed by recursively 260 // calling lexPreprocessor(). The processing of the encountered 261 // preprocessing directives includes updating preprocessing control stack 262 // and adding new macros into DefinedMacros set. 263 // 264 // The second parameter controls whether lexPreprocessor() is called from 265 // LexToken() (true) or recursively from lexPreprocessor() (false). 266 // 267 // If ReturnNextLiveToken is true, the method returns the next 268 // LEX token following the current directive or following the end 269 // of the disabled preprocessing region corresponding to this directive. 270 // If ReturnNextLiveToken is false, the method returns the first parameter, 271 // unless there were errors encountered in the disabled preprocessing 272 // region - in this case, it returns tgtok::Error. 273 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, 274 bool ReturnNextLiveToken = true); 275 276 // Worker method for lexPreprocessor() to skip lines after some 277 // preprocessing directive up to the buffer end or to the directive 278 // that re-enables token processing. The method returns true 279 // upon processing the next directive that re-enables tokens 280 // processing. False is returned if an error was encountered. 281 // 282 // Note that prepSkipRegion() calls lexPreprocessor() to process 283 // encountered preprocessing directives. In this case, the second 284 // parameter to lexPreprocessor() is set to false. Being passed 285 // false ReturnNextLiveToken, lexPreprocessor() must never call 286 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken 287 // to prepSkipRegion() and checking that it is never set to false. 288 bool prepSkipRegion(bool MustNeverBeFalse); 289 290 // Lex name of the macro after either #ifdef or #define. We could have used 291 // LexIdentifier(), but it has special handling of "include" word, which 292 // could result in awkward diagnostic errors. Consider: 293 // ---- 294 // #ifdef include 295 // class ... 296 // ---- 297 // LexIdentifier() will engage LexInclude(), which will complain about 298 // missing file with name "class". Instead, prepLexMacroName() will treat 299 // "include" as a normal macro name. 300 // 301 // On entry, CurPtr points to the end of a preprocessing directive word. 302 // The method allows for whitespaces between the preprocessing directive 303 // and the macro name. The allowed whitespaces are ' ' and '\t'. 304 // 305 // If the first non-whitespace symbol after the preprocessing directive 306 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then 307 // the method updates TokStart to the position of the first non-whitespace 308 // symbol, sets CurPtr to the position of the macro name's last symbol, 309 // and returns a string reference to the macro name. Otherwise, 310 // TokStart is set to the first non-whitespace symbol after the preprocessing 311 // directive, and the method returns an empty string reference. 312 // 313 // In all cases, TokStart may be used to point to the word following 314 // the preprocessing directive. 315 StringRef prepLexMacroName(); 316 317 // Skip any whitespaces starting from CurPtr. The method is used 318 // only in the lines-skipping mode to find the first non-whitespace 319 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' 320 // and '\r'. The method skips C-style comments as well, because 321 // it is used to find the beginning of the preprocessing directive. 322 // If we do not handle C-style comments the following code would 323 // result in incorrect detection of a preprocessing directive: 324 // /* 325 // #ifdef NAME 326 // */ 327 // As long as we skip C-style comments, the following code is correctly 328 // recognized as a preprocessing directive: 329 // /* first line comment 330 // second line comment */ #ifdef NAME 331 // 332 // The method returns true upon reaching the first non-whitespace symbol 333 // or EOF, CurPtr is set to point to this symbol. The method returns false, 334 // if an error occured during skipping of a C-style comment. 335 bool prepSkipLineBegin(); 336 337 // Skip any whitespaces or comments after a preprocessing directive. 338 // The method returns true upon reaching either end of the line 339 // or end of the file. If there is a multiline C-style comment 340 // after the preprocessing directive, the method skips 341 // the comment, so the final CurPtr may point to one of the next lines. 342 // The method returns false, if an error occured during skipping 343 // C- or C++-style comment, or a non-whitespace symbol appears 344 // after the preprocessing directive. 345 // 346 // The method maybe called both during lines-skipping and tokens 347 // processing. It actually verifies that only whitespaces or/and 348 // comments follow a preprocessing directive. 349 // 350 // After the execution of this mehod, CurPtr points either to new line 351 // symbol, buffer end or non-whitespace symbol following the preprocesing 352 // directive. 353 bool prepSkipDirectiveEnd(); 354 355 // Skip all symbols to the end of the line/file. 356 // The method adjusts CurPtr, so that it points to either new line 357 // symbol in the current line or the buffer end. 358 void prepSkipToLineEnd(); 359 360 // Return true, if the current preprocessor control stack is such that 361 // we should allow lexer to process the next token, false - otherwise. 362 // 363 // In particular, the method returns true, if all the #ifdef/#else 364 // controls on the stack have their IsDefined member set to true. 365 bool prepIsProcessingEnabled(); 366 367 // Report an error, if we reach EOF with non-empty preprocessing control 368 // stack. This means there is no matching #endif for the previous 369 // #ifdef/#else. 370 void prepReportPreprocessorStackError(); 371 }; 372 373 } // end namespace llvm 374 375 #endif 376