//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This class represents the Lexer for tablegen files. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H #define LLVM_LIB_TABLEGEN_TGLEXER_H #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/SMLoc.h" #include #include #include #include #include namespace llvm { template class ArrayRef; class SourceMgr; class Twine; namespace tgtok { enum TokKind { // Markers Eof, Error, // Tokens with no info. minus, // - plus, // + l_square, // [ r_square, // ] l_brace, // { r_brace, // } l_paren, // ( r_paren, // ) less, // < greater, // > colon, // : semi, // ; comma, // , dot, // . equal, // = question, // ? paste, // # dotdotdot, // ... // Boolean literals. TrueVal, FalseVal, // Integer value. IntVal, // Binary constant. Note that these are sized according to the number of // bits given. BinaryIntVal, // Preprocessing tokens for internal usage by the lexer. // They are never returned as a result of Lex(). Ifdef, Ifndef, Else, Endif, Define, // Reserved keywords. ('ElseKW' is named to distinguish it from the // existing 'Else' that means the preprocessor #else.) Bit, Bits, Code, Dag, ElseKW, FalseKW, Field, In, Include, Int, List, String, Then, TrueKW, // Object start tokens. OBJECT_START_FIRST, Assert = OBJECT_START_FIRST, Class, Def, Defm, Defset, Defvar, Dump, Foreach, If, Let, MultiClass, OBJECT_START_LAST = MultiClass, // Bang operators. BANG_OPERATOR_FIRST, XConcat = BANG_OPERATOR_FIRST, XADD, XSUB, XMUL, XDIV, XNOT, XLOG2, XAND, XOR, XXOR, XSRA, XSRL, XSHL, XListConcat, XListSplat, XStrConcat, XInterleave, XSubstr, XFind, XCast, XSubst, XForEach, XFilter, XFoldl, XHead, XTail, XSize, XEmpty, XIf, XCond, XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt, XSetDagOp, XGetDagOp, XExists, XListRemove, XToLower, XToUpper, XRange, XGetDagArg, XGetDagName, XSetDagArg, XSetDagName, XRepr, BANG_OPERATOR_LAST = XRepr, // String valued tokens. STRING_VALUE_FIRST, Id = STRING_VALUE_FIRST, StrVal, VarName, CodeFragment, STRING_VALUE_LAST = CodeFragment, }; /// isBangOperator - Return true if this is a bang operator. static inline bool isBangOperator(tgtok::TokKind Kind) { return tgtok::BANG_OPERATOR_FIRST <= Kind && Kind <= BANG_OPERATOR_LAST; } /// isObjectStart - Return true if this is a valid first token for a statement. static inline bool isObjectStart(tgtok::TokKind Kind) { return tgtok::OBJECT_START_FIRST <= Kind && Kind <= OBJECT_START_LAST; } /// isStringValue - Return true if this is a string value. static inline bool isStringValue(tgtok::TokKind Kind) { return tgtok::STRING_VALUE_FIRST <= Kind && Kind <= STRING_VALUE_LAST; } } // namespace tgtok /// TGLexer - TableGen Lexer class. class TGLexer { SourceMgr &SrcMgr; const char *CurPtr = nullptr; StringRef CurBuf; // Information about the current token. const char *TokStart = nullptr; tgtok::TokKind CurCode = tgtok::TokKind::Eof; std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment int64_t CurIntVal = 0; // This is valid for IntVal. /// CurBuffer - This is the current buffer index we're lexing from as managed /// by the SourceMgr object. unsigned CurBuffer = 0; public: typedef std::set DependenciesSetTy; private: /// Dependencies - This is the list of all included files. DependenciesSetTy Dependencies; public: TGLexer(SourceMgr &SrcMgr, ArrayRef Macros); tgtok::TokKind Lex() { return CurCode = LexToken(CurPtr == CurBuf.begin()); } const DependenciesSetTy &getDependencies() const { return Dependencies; } tgtok::TokKind getCode() const { return CurCode; } const std::string &getCurStrVal() const { assert(tgtok::isStringValue(CurCode) && "This token doesn't have a string value"); return CurStrVal; } int64_t getCurIntVal() const { assert(CurCode == tgtok::IntVal && "This token isn't an integer"); return CurIntVal; } std::pair getCurBinaryIntVal() const { assert(CurCode == tgtok::BinaryIntVal && "This token isn't a binary integer"); return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); } SMLoc getLoc() const; SMRange getLocRange() const; private: /// LexToken - Read the next token and return its code. tgtok::TokKind LexToken(bool FileOrLineStart = false); tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); int getNextChar(); int peekNextChar(int Index) const; void SkipBCPLComment(); bool SkipCComment(); tgtok::TokKind LexIdentifier(); bool LexInclude(); tgtok::TokKind LexString(); tgtok::TokKind LexVarName(); tgtok::TokKind LexNumber(); tgtok::TokKind LexBracket(); tgtok::TokKind LexExclaim(); // Process EOF encountered in LexToken(). // If EOF is met in an include file, then the method will update // CurPtr, CurBuf and preprocessing include stack, and return true. // If EOF is met in the top-level file, then the method will // update and check the preprocessing include stack, and return false. bool processEOF(); // *** Structures and methods for preprocessing support *** // A set of macro names that are defined either via command line or // by using: // #define NAME StringSet<> DefinedMacros; // Each of #ifdef and #else directives has a descriptor associated // with it. // // An ordered list of preprocessing controls defined by #ifdef/#else // directives that are in effect currently is called preprocessing // control stack. It is represented as a vector of PreprocessorControlDesc's. // // The control stack is updated according to the following rules: // // For each #ifdef we add an element to the control stack. // For each #else we replace the top element with a descriptor // with an inverted IsDefined value. // For each #endif we pop the top element from the control stack. // // When CurPtr reaches the current buffer's end, the control stack // must be empty, i.e. #ifdef and the corresponding #endif // must be located in the same file. struct PreprocessorControlDesc { // Either tgtok::Ifdef or tgtok::Else. tgtok::TokKind Kind; // True, if the condition for this directive is true, false - otherwise. // Examples: // #ifdef NAME : true, if NAME is defined, false - otherwise. // ... // #else : false, if NAME is defined, true - otherwise. bool IsDefined; // Pointer into CurBuf to the beginning of the preprocessing directive // word, e.g.: // #ifdef NAME // ^ - SrcPos SMLoc SrcPos; }; // We want to disallow code like this: // file1.td: // #define NAME // #ifdef NAME // include "file2.td" // EOF // file2.td: // #endif // EOF // // To do this, we clear the preprocessing control stack on entry // to each of the included file. PrepIncludeStack is used to store // preprocessing control stacks for the current file and all its // parent files. The back() element is the preprocessing control // stack for the current file. std::vector>> PrepIncludeStack; // Validate that the current preprocessing control stack is empty, // since we are about to exit a file, and pop the include stack. // // If IncludeStackMustBeEmpty is true, the include stack must be empty // after the popping, otherwise, the include stack must not be empty // after the popping. Basically, the include stack must be empty // only if we exit the "top-level" file (i.e. finish lexing). // // The method returns false, if the current preprocessing control stack // is not empty (e.g. there is an unterminated #ifdef/#else), // true - otherwise. bool prepExitInclude(bool IncludeStackMustBeEmpty); // Look ahead for a preprocessing directive starting from CurPtr. The caller // must only call this method, if *(CurPtr - 1) is '#'. If the method matches // a preprocessing directive word followed by a whitespace, then it returns // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. // // CurPtr is not adjusted by this method. tgtok::TokKind prepIsDirective() const; // Given a preprocessing token kind, adjusts CurPtr to the end // of the preprocessing directive word. Returns true, unless // an unsupported token kind is passed in. // // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() // to avoid adjusting CurPtr before we are sure that '#' is followed // by a preprocessing directive. If it is not, then we fall back to // tgtok::paste interpretation of '#'. bool prepEatPreprocessorDirective(tgtok::TokKind Kind); // The main "exit" point from the token parsing to preprocessor. // // The method is called for CurPtr, when prepIsDirective() returns // true. The first parameter matches the result of prepIsDirective(), // denoting the actual preprocessor directive to be processed. // // If the preprocessing directive disables the tokens processing, e.g.: // #ifdef NAME // NAME is undefined // then lexPreprocessor() enters the lines-skipping mode. // In this mode, it does not parse any tokens, because the code under // the #ifdef may not even be a correct tablegen code. The preprocessor // looks for lines containing other preprocessing directives, which // may be prepended with whitespaces and C-style comments. If the line // does not contain a preprocessing directive, it is skipped completely. // Otherwise, the preprocessing directive is processed by recursively // calling lexPreprocessor(). The processing of the encountered // preprocessing directives includes updating preprocessing control stack // and adding new macros into DefinedMacros set. // // The second parameter controls whether lexPreprocessor() is called from // LexToken() (true) or recursively from lexPreprocessor() (false). // // If ReturnNextLiveToken is true, the method returns the next // LEX token following the current directive or following the end // of the disabled preprocessing region corresponding to this directive. // If ReturnNextLiveToken is false, the method returns the first parameter, // unless there were errors encountered in the disabled preprocessing // region - in this case, it returns tgtok::Error. tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, bool ReturnNextLiveToken = true); // Worker method for lexPreprocessor() to skip lines after some // preprocessing directive up to the buffer end or to the directive // that re-enables token processing. The method returns true // upon processing the next directive that re-enables tokens // processing. False is returned if an error was encountered. // // Note that prepSkipRegion() calls lexPreprocessor() to process // encountered preprocessing directives. In this case, the second // parameter to lexPreprocessor() is set to false. Being passed // false ReturnNextLiveToken, lexPreprocessor() must never call // prepSkipRegion(). We assert this by passing ReturnNextLiveToken // to prepSkipRegion() and checking that it is never set to false. bool prepSkipRegion(bool MustNeverBeFalse); // Lex name of the macro after either #ifdef or #define. We could have used // LexIdentifier(), but it has special handling of "include" word, which // could result in awkward diagnostic errors. Consider: // ---- // #ifdef include // class ... // ---- // LexIdentifier() will engage LexInclude(), which will complain about // missing file with name "class". Instead, prepLexMacroName() will treat // "include" as a normal macro name. // // On entry, CurPtr points to the end of a preprocessing directive word. // The method allows for whitespaces between the preprocessing directive // and the macro name. The allowed whitespaces are ' ' and '\t'. // // If the first non-whitespace symbol after the preprocessing directive // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then // the method updates TokStart to the position of the first non-whitespace // symbol, sets CurPtr to the position of the macro name's last symbol, // and returns a string reference to the macro name. Otherwise, // TokStart is set to the first non-whitespace symbol after the preprocessing // directive, and the method returns an empty string reference. // // In all cases, TokStart may be used to point to the word following // the preprocessing directive. StringRef prepLexMacroName(); // Skip any whitespaces starting from CurPtr. The method is used // only in the lines-skipping mode to find the first non-whitespace // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' // and '\r'. The method skips C-style comments as well, because // it is used to find the beginning of the preprocessing directive. // If we do not handle C-style comments the following code would // result in incorrect detection of a preprocessing directive: // /* // #ifdef NAME // */ // As long as we skip C-style comments, the following code is correctly // recognized as a preprocessing directive: // /* first line comment // second line comment */ #ifdef NAME // // The method returns true upon reaching the first non-whitespace symbol // or EOF, CurPtr is set to point to this symbol. The method returns false, // if an error occurred during skipping of a C-style comment. bool prepSkipLineBegin(); // Skip any whitespaces or comments after a preprocessing directive. // The method returns true upon reaching either end of the line // or end of the file. If there is a multiline C-style comment // after the preprocessing directive, the method skips // the comment, so the final CurPtr may point to one of the next lines. // The method returns false, if an error occurred during skipping // C- or C++-style comment, or a non-whitespace symbol appears // after the preprocessing directive. // // The method maybe called both during lines-skipping and tokens // processing. It actually verifies that only whitespaces or/and // comments follow a preprocessing directive. // // After the execution of this mehod, CurPtr points either to new line // symbol, buffer end or non-whitespace symbol following the preprocesing // directive. bool prepSkipDirectiveEnd(); // Skip all symbols to the end of the line/file. // The method adjusts CurPtr, so that it points to either new line // symbol in the current line or the buffer end. void prepSkipToLineEnd(); // Return true, if the current preprocessor control stack is such that // we should allow lexer to process the next token, false - otherwise. // // In particular, the method returns true, if all the #ifdef/#else // controls on the stack have their IsDefined member set to true. bool prepIsProcessingEnabled(); // Report an error, if we reach EOF with non-empty preprocessing control // stack. This means there is no matching #endif for the previous // #ifdef/#else. void prepReportPreprocessorStackError(); }; } // end namespace llvm #endif