xref: /freebsd/contrib/llvm-project/llvm/lib/TableGen/TGLexer.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class represents the Lexer for tablegen files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14 #define LLVM_LIB_TABLEGEN_TGLEXER_H
15 
16 #include "llvm/ADT/StringRef.h"
17 #include "llvm/ADT/StringSet.h"
18 #include "llvm/Support/DataTypes.h"
19 #include "llvm/Support/SMLoc.h"
20 #include <cassert>
21 #include <memory>
22 #include <set>
23 #include <string>
24 #include <vector>
25 
26 namespace llvm {
27 template <typename T> class ArrayRef;
28 class SourceMgr;
29 class Twine;
30 
namespace tgtok {
/// TokKind - The kinds of token the TableGen lexer distinguishes.
///
/// Enumerators are laid out in contiguous families that are bracketed by
/// *_FIRST / *_LAST aliases.  The predicates declared after the enum
/// (isObjectStart(), isBangOperator() and isStringValue()) rely on that layout
/// and test membership with a plain range comparison, so enumerators must not
/// be moved across a family boundary.
enum TokKind {
  // Markers
  Eof,
  Error,

  // Tokens with no info.
  minus,     // -
  plus,      // +
  l_square,  // [
  r_square,  // ]
  l_brace,   // {
  r_brace,   // }
  l_paren,   // (
  r_paren,   // )
  less,      // <
  greater,   // >
  colon,     // :
  semi,      // ;
  comma,     // ,
  dot,       // .
  equal,     // =
  question,  // ?
  paste,     // #
  dotdotdot, // ...

  // Boolean literals.
  TrueVal,
  FalseVal,

  // Integer value.
  IntVal,

  // Binary constant.  Note that these are sized according to the number of
  // bits given.
  BinaryIntVal,

  // Preprocessing tokens for internal usage by the lexer.
  // They are never returned as a result of Lex().
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define,

  // Reserved keywords. ('ElseKW' is named to distinguish it from the
  // existing 'Else' that means the preprocessor #else.)
  Bit,
  Bits,
  Code,
  Dag,
  ElseKW,
  FalseKW,
  Field,
  In,
  Include,
  Int,
  List,
  String,
  Then,
  TrueKW,

  // Object start tokens.  Everything in [OBJECT_START_FIRST,
  // OBJECT_START_LAST] is accepted by isObjectStart().
  OBJECT_START_FIRST,
  Assert = OBJECT_START_FIRST,
  Class,
  Def,
  Defm,
  Defset,
  Deftype,
  Defvar,
  Dump,
  Foreach,
  If,
  Let,
  MultiClass,
  OBJECT_START_LAST = MultiClass,

  // Bang operators.  Everything in [BANG_OPERATOR_FIRST, BANG_OPERATOR_LAST]
  // is accepted by isBangOperator().
  BANG_OPERATOR_FIRST,
  XConcat = BANG_OPERATOR_FIRST,
  XADD,
  XSUB,
  XMUL,
  XDIV,
  XNOT,
  XLOG2,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,
  XExists,
  XListRemove,
  XToLower,
  XToUpper,
  XRange,
  XGetDagArg,
  XGetDagName,
  XSetDagArg,
  XSetDagName,
  XRepr,
  BANG_OPERATOR_LAST = XRepr,

  // String valued tokens.  Everything in [STRING_VALUE_FIRST,
  // STRING_VALUE_LAST] is accepted by isStringValue().
  STRING_VALUE_FIRST,
  Id = STRING_VALUE_FIRST,
  StrVal,
  VarName,
  CodeFragment,
  STRING_VALUE_LAST = CodeFragment,
};

/// isBangOperator - Return true if this is a bang operator, i.e. \p Kind lies
/// in the inclusive range [BANG_OPERATOR_FIRST, BANG_OPERATOR_LAST].
static inline bool isBangOperator(tgtok::TokKind Kind) {
  return Kind >= tgtok::BANG_OPERATOR_FIRST &&
         Kind <= tgtok::BANG_OPERATOR_LAST;
}

/// isObjectStart - Return true if this is a valid first token for a statement,
/// i.e. \p Kind lies in the inclusive range
/// [OBJECT_START_FIRST, OBJECT_START_LAST].
static inline bool isObjectStart(tgtok::TokKind Kind) {
  return Kind >= tgtok::OBJECT_START_FIRST &&
         Kind <= tgtok::OBJECT_START_LAST;
}

/// isStringValue - Return true if this is a string value, i.e. \p Kind lies in
/// the inclusive range [STRING_VALUE_FIRST, STRING_VALUE_LAST].
static inline bool isStringValue(tgtok::TokKind Kind) {
  return Kind >= tgtok::STRING_VALUE_FIRST &&
         Kind <= tgtok::STRING_VALUE_LAST;
}
} // namespace tgtok
187 
188 /// TGLexer - TableGen Lexer class.
189 class TGLexer {
190   SourceMgr &SrcMgr;
191 
192   const char *CurPtr = nullptr;
193   StringRef CurBuf;
194 
195   // Information about the current token.
196   const char *TokStart = nullptr;
197   tgtok::TokKind CurCode = tgtok::TokKind::Eof;
198   std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
199   int64_t CurIntVal = 0; // This is valid for IntVal.
200 
201   /// CurBuffer - This is the current buffer index we're lexing from as managed
202   /// by the SourceMgr object.
203   unsigned CurBuffer = 0;
204 
205 public:
206   typedef std::set<std::string> DependenciesSetTy;
207 
208 private:
209   /// Dependencies - This is the list of all included files.
210   DependenciesSetTy Dependencies;
211 
212 public:
213   TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
214 
Lex()215   tgtok::TokKind Lex() {
216     return CurCode = LexToken(CurPtr == CurBuf.begin());
217   }
218 
getDependencies()219   const DependenciesSetTy &getDependencies() const {
220     return Dependencies;
221   }
222 
getCode()223   tgtok::TokKind getCode() const { return CurCode; }
224 
getCurStrVal()225   const std::string &getCurStrVal() const {
226     assert(tgtok::isStringValue(CurCode) &&
227            "This token doesn't have a string value");
228     return CurStrVal;
229   }
getCurIntVal()230   int64_t getCurIntVal() const {
231     assert(CurCode == tgtok::IntVal && "This token isn't an integer");
232     return CurIntVal;
233   }
getCurBinaryIntVal()234   std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
235     assert(CurCode == tgtok::BinaryIntVal &&
236            "This token isn't a binary integer");
237     return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
238   }
239 
240   SMLoc getLoc() const;
241   SMRange getLocRange() const;
242 
243 private:
244   /// LexToken - Read the next token and return its code.
245   tgtok::TokKind LexToken(bool FileOrLineStart = false);
246 
247   tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
248   tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
249 
250   int getNextChar();
251   int peekNextChar(int Index) const;
252   void SkipBCPLComment();
253   bool SkipCComment();
254   tgtok::TokKind LexIdentifier();
255   bool LexInclude();
256   tgtok::TokKind LexString();
257   tgtok::TokKind LexVarName();
258   tgtok::TokKind LexNumber();
259   tgtok::TokKind LexBracket();
260   tgtok::TokKind LexExclaim();
261 
262   // Process EOF encountered in LexToken().
263   // If EOF is met in an include file, then the method will update
264   // CurPtr, CurBuf and preprocessing include stack, and return true.
265   // If EOF is met in the top-level file, then the method will
266   // update and check the preprocessing include stack, and return false.
267   bool processEOF();
268 
269   // *** Structures and methods for preprocessing support ***
270 
271   // A set of macro names that are defined either via command line or
272   // by using:
273   //     #define NAME
274   StringSet<> DefinedMacros;
275 
276   // Each of #ifdef and #else directives has a descriptor associated
277   // with it.
278   //
279   // An ordered list of preprocessing controls defined by #ifdef/#else
280   // directives that are in effect currently is called preprocessing
281   // control stack.  It is represented as a vector of PreprocessorControlDesc's.
282   //
283   // The control stack is updated according to the following rules:
284   //
285   // For each #ifdef we add an element to the control stack.
286   // For each #else we replace the top element with a descriptor
287   // with an inverted IsDefined value.
288   // For each #endif we pop the top element from the control stack.
289   //
290   // When CurPtr reaches the current buffer's end, the control stack
291   // must be empty, i.e. #ifdef and the corresponding #endif
292   // must be located in the same file.
293   struct PreprocessorControlDesc {
294     // Either tgtok::Ifdef or tgtok::Else.
295     tgtok::TokKind Kind;
296 
297     // True, if the condition for this directive is true, false - otherwise.
298     // Examples:
299     //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
300     //     ...
301     //     #else             : false, if NAME is defined, true - otherwise.
302     bool IsDefined;
303 
304     // Pointer into CurBuf to the beginning of the preprocessing directive
305     // word, e.g.:
306     //     #ifdef NAME
307     //      ^ - SrcPos
308     SMLoc SrcPos;
309   };
310 
311   // We want to disallow code like this:
312   //     file1.td:
313   //         #define NAME
314   //         #ifdef NAME
315   //         include "file2.td"
316   //     EOF
317   //     file2.td:
318   //         #endif
319   //     EOF
320   //
321   // To do this, we clear the preprocessing control stack on entry
322   // to each of the included file.  PrepIncludeStack is used to store
323   // preprocessing control stacks for the current file and all its
324   // parent files.  The back() element is the preprocessing control
325   // stack for the current file.
326   std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
327       PrepIncludeStack;
328 
329   // Validate that the current preprocessing control stack is empty,
330   // since we are about to exit a file, and pop the include stack.
331   //
332   // If IncludeStackMustBeEmpty is true, the include stack must be empty
333   // after the popping, otherwise, the include stack must not be empty
334   // after the popping.  Basically, the include stack must be empty
335   // only if we exit the "top-level" file (i.e. finish lexing).
336   //
337   // The method returns false, if the current preprocessing control stack
338   // is not empty (e.g. there is an unterminated #ifdef/#else),
339   // true - otherwise.
340   bool prepExitInclude(bool IncludeStackMustBeEmpty);
341 
342   // Look ahead for a preprocessing directive starting from CurPtr.  The caller
343   // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
344   // a preprocessing directive word followed by a whitespace, then it returns
345   // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
346   //
347   // CurPtr is not adjusted by this method.
348   tgtok::TokKind prepIsDirective() const;
349 
350   // Given a preprocessing token kind, adjusts CurPtr to the end
351   // of the preprocessing directive word.  Returns true, unless
352   // an unsupported token kind is passed in.
353   //
354   // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
355   // to avoid adjusting CurPtr before we are sure that '#' is followed
356   // by a preprocessing directive.  If it is not, then we fall back to
357   // tgtok::paste interpretation of '#'.
358   bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
359 
360   // The main "exit" point from the token parsing to preprocessor.
361   //
362   // The method is called for CurPtr, when prepIsDirective() returns
363   // true.  The first parameter matches the result of prepIsDirective(),
364   // denoting the actual preprocessor directive to be processed.
365   //
366   // If the preprocessing directive disables the tokens processing, e.g.:
367   //     #ifdef NAME // NAME is undefined
368   // then lexPreprocessor() enters the lines-skipping mode.
369   // In this mode, it does not parse any tokens, because the code under
370   // the #ifdef may not even be a correct tablegen code.  The preprocessor
371   // looks for lines containing other preprocessing directives, which
372   // may be prepended with whitespaces and C-style comments.  If the line
373   // does not contain a preprocessing directive, it is skipped completely.
374   // Otherwise, the preprocessing directive is processed by recursively
375   // calling lexPreprocessor().  The processing of the encountered
376   // preprocessing directives includes updating preprocessing control stack
377   // and adding new macros into DefinedMacros set.
378   //
379   // The second parameter controls whether lexPreprocessor() is called from
380   // LexToken() (true) or recursively from lexPreprocessor() (false).
381   //
382   // If ReturnNextLiveToken is true, the method returns the next
383   // LEX token following the current directive or following the end
384   // of the disabled preprocessing region corresponding to this directive.
385   // If ReturnNextLiveToken is false, the method returns the first parameter,
386   // unless there were errors encountered in the disabled preprocessing
387   // region - in this case, it returns tgtok::Error.
388   tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
389                                  bool ReturnNextLiveToken = true);
390 
391   // Worker method for lexPreprocessor() to skip lines after some
392   // preprocessing directive up to the buffer end or to the directive
393   // that re-enables token processing.  The method returns true
394   // upon processing the next directive that re-enables tokens
395   // processing.  False is returned if an error was encountered.
396   //
397   // Note that prepSkipRegion() calls lexPreprocessor() to process
398   // encountered preprocessing directives.  In this case, the second
399   // parameter to lexPreprocessor() is set to false.  Being passed
400   // false ReturnNextLiveToken, lexPreprocessor() must never call
401   // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
402   // to prepSkipRegion() and checking that it is never set to false.
403   bool prepSkipRegion(bool MustNeverBeFalse);
404 
405   // Lex name of the macro after either #ifdef or #define.  We could have used
406   // LexIdentifier(), but it has special handling of "include" word, which
407   // could result in awkward diagnostic errors.  Consider:
408   // ----
409   // #ifdef include
410   // class ...
411   // ----
412   // LexIdentifier() will engage LexInclude(), which will complain about
413   // missing file with name "class".  Instead, prepLexMacroName() will treat
414   // "include" as a normal macro name.
415   //
416   // On entry, CurPtr points to the end of a preprocessing directive word.
417   // The method allows for whitespaces between the preprocessing directive
418   // and the macro name.  The allowed whitespaces are ' ' and '\t'.
419   //
420   // If the first non-whitespace symbol after the preprocessing directive
421   // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
422   // the method updates TokStart to the position of the first non-whitespace
423   // symbol, sets CurPtr to the position of the macro name's last symbol,
424   // and returns a string reference to the macro name.  Otherwise,
425   // TokStart is set to the first non-whitespace symbol after the preprocessing
426   // directive, and the method returns an empty string reference.
427   //
428   // In all cases, TokStart may be used to point to the word following
429   // the preprocessing directive.
430   StringRef prepLexMacroName();
431 
432   // Skip any whitespaces starting from CurPtr.  The method is used
433   // only in the lines-skipping mode to find the first non-whitespace
434   // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
435   // and '\r'.  The method skips C-style comments as well, because
436   // it is used to find the beginning of the preprocessing directive.
437   // If we do not handle C-style comments the following code would
438   // result in incorrect detection of a preprocessing directive:
439   //     /*
440   //     #ifdef NAME
441   //     */
442   // As long as we skip C-style comments, the following code is correctly
443   // recognized as a preprocessing directive:
444   //     /* first line comment
445   //        second line comment */ #ifdef NAME
446   //
447   // The method returns true upon reaching the first non-whitespace symbol
448   // or EOF, CurPtr is set to point to this symbol.  The method returns false,
449   // if an error occurred during skipping of a C-style comment.
450   bool prepSkipLineBegin();
451 
452   // Skip any whitespaces or comments after a preprocessing directive.
453   // The method returns true upon reaching either end of the line
454   // or end of the file.  If there is a multiline C-style comment
455   // after the preprocessing directive, the method skips
456   // the comment, so the final CurPtr may point to one of the next lines.
457   // The method returns false, if an error occurred during skipping
458   // C- or C++-style comment, or a non-whitespace symbol appears
459   // after the preprocessing directive.
460   //
461   // The method maybe called both during lines-skipping and tokens
462   // processing.  It actually verifies that only whitespaces or/and
463   // comments follow a preprocessing directive.
464   //
465   // After the execution of this mehod, CurPtr points either to new line
466   // symbol, buffer end or non-whitespace symbol following the preprocesing
467   // directive.
468   bool prepSkipDirectiveEnd();
469 
470   // Return true, if the current preprocessor control stack is such that
471   // we should allow lexer to process the next token, false - otherwise.
472   //
473   // In particular, the method returns true, if all the #ifdef/#else
474   // controls on the stack have their IsDefined member set to true.
475   bool prepIsProcessingEnabled();
476 
477   // Report an error, if we reach EOF with non-empty preprocessing control
478   // stack.  This means there is no matching #endif for the previous
479   // #ifdef/#else.
480   void prepReportPreprocessorStackError();
481 };
482 
483 } // end namespace llvm
484 
485 #endif
486