xref: /linux/scripts/genksyms/lex.l (revision c8bfe3fad4f86a029da7157bae9699c816f0c309)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Lexical analysis for genksyms.
4  * Copyright 1996, 1997 Linux International.
5  *
6  * New implementation contributed by Richard Henderson <rth@tamu.edu>
7  * Based on original work by Bjorn Ekwall <bj0rn@blox.se>
8  *
9  * Taken from Linux modutils 2.4.22.
10  */
11 
12 %{
13 
14 #include <limits.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <ctype.h>
18 
19 #include "genksyms.h"
20 #include "parse.tab.h"
21 
22 /* We've got a two-level lexer here.  We let flex do basic tokenization
23    and then we categorize those basic tokens in the second stage.  */
/* YY_DECL gives the flex-generated scanner the file-local name yylex1(),
   which leaves the name yylex free for the second-stage function defined
   in the user-code section at the end of this file.  */
24 #define YY_DECL		static int yylex1(void)
25 
26 %}
27 
/* Identifiers.  '$' is accepted both as the first and as a later
   character.  */
28 IDENT			[A-Za-z_\$][A-Za-z0-9_\$]*
29 
/* Integer literals: octal (which also covers a lone "0"), decimal and
   hexadecimal, followed by an optional suffix.  I_SUF lists only a single
   U, a single L, or one of each in either order; the "LL" forms (LL, ULL,
   LLU) are not among its alternatives.
   NOTE(review): a literal such as 1ULL is therefore presumably scanned as
   INT "1UL" followed by IDENT "L" -- confirm whether that is acceptable.  */
30 O_INT			0[0-7]*
31 D_INT			[1-9][0-9]*
32 X_INT			0[Xx][0-9A-Fa-f]+
33 I_SUF			[Uu]|[Ll]|[Uu][Ll]|[Ll][Uu]
34 INT			({O_INT}|{D_INT}|{X_INT}){I_SUF}?
35 
/* Floating literals.  FRAC needs a decimal point with digits on at least
   one side of it.  REAL is either a FRAC with an optional exponent, or a
   plain digit string with a mandatory exponent; both forms take an
   optional F/f/L/l suffix.  */
36 FRAC			([0-9]*\.[0-9]+)|([0-9]+\.)
37 EXP			[Ee][+-]?[0-9]+
38 F_SUF			[FfLl]
39 REAL			({FRAC}{EXP}?{F_SUF}?)|([0-9]+{EXP}{F_SUF}?)
40 
/* String and character literals, with an optional L prefix.  Between the
   delimiters, a backslash is always consumed together with the single
   character that follows it, so an escaped delimiter does not end the
   literal.  */
41 STRING			L?\"([^\\\"]*\\.)*[^\\\"]*\"
42 CHAR			L?\'([^\\\']*\\.)*[^\\\']*\'
43 
/* Two-character operators: one of the bracketed characters followed by
   '=', or one of && || -> << >>.
   NOTE(review): "!=", "++" and "--" are not listed here, so they are
   presumably returned as two single-character tokens -- confirm that this
   is intended.  */
44 MC_TOKEN		([~%^&*+=|<>/-]=)|(&&)|("||")|(->)|(<<)|(>>)
45 
46 /* We don't do multiple input files.  */
47 %option noyywrap
48 
/* NOTE(review): presumably suppresses generation of flex's input() helper,
   which nothing in this file calls -- confirm against the flex manual.  */
49 %option noinput
50 
51 %%
52 
53 
54  /* Keep track of our location in the original source files.  */
 /* A line of the form  # <integer> "<file>" ...  is returned as FILENAME
    with the whole line, newline included, left in yytext.  The second
    stage (yylex) extracts the file name and line number from that text.  */
55 ^#[ \t]+{INT}[ \t]+\"[^\"\n]+\".*\n	return FILENAME;
 /* Any other line starting with '#' is consumed whole and only counted;
    so is every newline outside such lines.  Neither produces a token.  */
56 ^#.*\n					cur_line++;
57 \n					cur_line++;
58 
59  /* Ignore all other whitespace.  */
60 [ \t\f\v\r]+				;
61 
62 
 /* Literals and identifiers.  Keywords are not told apart from ordinary
    identifiers here; yylex classifies IDENT tokens afterwards.  */
63 {STRING}				return STRING;
64 {CHAR}					return CHAR;
65 {IDENT}					return IDENT;
66 
67  /* The Pedant requires that the other C multi-character tokens be
68     recognized as tokens.  We don't actually use them since we don't
69     parse expressions, but we do want whitespace to be arranged
70     around them properly.  */
71 {MC_TOKEN}				return OTHER;
72 {INT}					return INT;
73 {REAL}					return REAL;
74 
 /* The ellipsis gets a token code of its own.  */
75 "..."					return DOTS;
76 
77  /* All other tokens are single characters.  */
 /* The character itself is used as the token code.  */
78 .					return yytext[0];
79 
80 
81 %%
82 
83 /* Bring in the keyword recognizer.  */
84 
85 #include "keywords.c"
86 
87 
88 /* Macros to append to our phrase collection list.  */
89 
90 /*
91  * We mark any token that equals a known enumerator as SYM_ENUM_CONST.
92  * The parser will change this for struct and union tags later; the only
93  * problem is struct and union members:
94  *    enum e { a, b }; struct s { int a, b; }
95  * but in this case the only effect is that the ABI checksums become more
96  * volatile, which is acceptable. Also, such collisions are quite rare;
97  * so far one has only been observed in include/linux/telephony.h.
98  */
/*
 * _APP(T, L) appends one token to the phrase list: it copies L + 1 bytes
 * starting at T (so T must be NUL-terminated at T[L]) into a fresh string
 * and stores it in the node that was pre-allocated by the previous append.
 *
 * List invariant: next_node is always an empty, already allocated node
 * whose ->next points at the most recently filled node (or NULL before the
 * first append), so the list is linked newest-first.
 *
 * The macro uses cur_node, next_node and in_source_file from the scope it
 * is expanded in.  T and L are evaluated more than once, so pass only
 * side-effect-free expressions; both are parenthesized in the expansion so
 * that compound expressions keep their meaning.
 */
#define _APP(T, L)	do {						   \
			  cur_node = next_node;				   \
			  next_node = xmalloc(sizeof(*next_node));	   \
			  next_node->next = cur_node;			   \
			  cur_node->string =				   \
			    memcpy(xmalloc((L) + 1), (T), (L) + 1);	   \
			  cur_node->tag =				   \
			    find_symbol(cur_node->string, SYM_ENUM_CONST, 1) \
			    ? SYM_ENUM_CONST : SYM_NORMAL;		   \
			  cur_node->in_source_file = in_source_file;	   \
			} while (0)

/* Append the token flex has just matched.  */
#define APP		_APP(yytext, yyleng)
111 
112 
/* The second stage lexer.  Here we incorporate knowledge of the state
   of the parser to tailor the tokens that are returned.

   Each call pulls raw tokens from yylex1() and appends their text to the
   phrase list with APP.  In ST_NORMAL one raw token yields one returned
   token; identifiers are reclassified as keywords or as TYPE.  In the
   other states raw tokens are accumulated until the construct is complete
   and a single *_PHRASE token is returned for the whole run.

   FILENAME tokens (preprocessor line markers) are consumed here in every
   state and never reach the parser.

   Returns 0 at end of input, otherwise the token code.  On a non-zero
   return yylval is set to the address of the link that points at the most
   recently appended list node.  */

int
yylex(void)
{
  static enum {
    ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1,
    ST_BRACKET, ST_BRACE, ST_EXPRESSION, ST_STATIC_ASSERT,
  } lexstate = ST_NOTSTARTED;

  /* Countdowns armed when struct/union/enum is returned and decremented
     once per returned token at "fini".  They stay non-zero for the tag
     name and for the '{' that follows either the keyword or the tag, so
     the tag is not looked up as a typedef and that '{' is returned as a
     plain token instead of opening a BRACE_PHRASE.  */
  static int suppress_type_lookup, dont_want_brace_phrase;
  /* Empty, pre-allocated node that the next APP will fill.  */
  static struct string_list *next_node;
  /* File name taken from the first line marker seen.  */
  static char *source_file;

  /* count is the bracket nesting depth of the phrase being collected.  It
     is reset on every call; only ST_EXPRESSION is entered by returning to
     the caller, and it starts at depth 0.  */
  int token, count = 0;
  struct string_list *cur_node;

  if (lexstate == ST_NOTSTARTED)
    {
      next_node = xmalloc(sizeof(*next_node));
      next_node->next = NULL;
      lexstate = ST_NORMAL;
    }

repeat:
  token = yylex1();

  if (token == 0)
    return 0;

  if (token == FILENAME)
    {
      char *file, *e;

      /* Save the filename and line number for later error messages.
         The FILENAME rule only matches text containing two double quotes
         with at least one character between them, so neither strchr()
         call can return NULL.  */

      free(cur_filename);

      file = strchr(yytext, '\"') + 1;
      e = strchr(file, '\"');
      *e = '\0';
      cur_filename = memcpy(xmalloc(e - file + 1), file, e - file + 1);
      /* yytext is '#', one or more blanks, then the number; atoi() skips
         whatever blanks remain after the first two characters.  */
      cur_line = atoi(yytext + 2);

      /* The first line marker names the main source file; every later
         one is compared against it.  */
      if (!source_file)
        {
          source_file = xstrdup(cur_filename);
          in_source_file = 1;
        }
      else
        {
          in_source_file = (strcmp(cur_filename, source_file) == 0);
        }

      goto repeat;
    }

  switch (lexstate)
    {
    case ST_NORMAL:
      switch (token)
        {
        case IDENT:
          APP;
          {
            int r = is_reserved_word(yytext, yyleng);

            if (r >= 0)
              {
                token = r;
                switch (token)
                  {
                  /* These keywords start a phrase: keep reading in the
                     matching state instead of returning the keyword.  */
                  case ATTRIBUTE_KEYW:
                    lexstate = ST_ATTRIBUTE;
                    count = 0;
                    goto repeat;
                  case ASM_KEYW:
                    lexstate = ST_ASM;
                    count = 0;
                    goto repeat;
                  case TYPEOF_KEYW:
                    lexstate = ST_TYPEOF;
                    count = 0;
                    goto repeat;

                  case STRUCT_KEYW:
                  case UNION_KEYW:
                  case ENUM_KEYW:
                    dont_want_brace_phrase = 3;
                    suppress_type_lookup = 2;
                    goto fini;

                  case EXPORT_SYMBOL_KEYW:
                    goto fini;

                  case STATIC_ASSERT_KEYW:
                    lexstate = ST_STATIC_ASSERT;
                    count = 0;
                    goto repeat;

                  default:
                    /* Any other keyword takes the typedef check below,
                       like an ordinary identifier.  */
                    break;
                  }
              }
            if (!suppress_type_lookup
                && find_symbol(yytext, SYM_TYPEDEF, 1))
              token = TYPE;
          }
          break;

        case '[':
          APP;
          lexstate = ST_BRACKET;
          count = 1;
          goto repeat;

        case '{':
          APP;
          if (dont_want_brace_phrase)
            break;
          lexstate = ST_BRACE;
          count = 1;
          goto repeat;

        case '=': case ':':
          /* Return the token itself now; the expression that follows is
             collected by the next call.  */
          APP;
          lexstate = ST_EXPRESSION;
          break;

        default:
          APP;
          break;
        }
      break;

    case ST_ATTRIBUTE:
      /* Collect everything up to the ')' that balances the first '('.  */
      APP;
      switch (token)
        {
        case '(':
          ++count;
          goto repeat;
        case ')':
          if (--count == 0)
            {
              lexstate = ST_NORMAL;
              token = ATTRIBUTE_PHRASE;
              break;
            }
          goto repeat;
        default:
          goto repeat;
        }
      break;

    case ST_ASM:
      APP;
      switch (token)
        {
        case '(':
          ++count;
          goto repeat;
        case ')':
          if (--count == 0)
            {
              lexstate = ST_NORMAL;
              token = ASM_PHRASE;
              break;
            }
          goto repeat;
        default:
          goto repeat;
        }
      break;

    case ST_TYPEOF_1:
      /* We have just seen "typeof (" and held the '(' back.  If a type
         starts here, push the current token and the '(' back into the
         input and return TYPEOF_KEYW so that the following tokens are
         returned individually from ST_NORMAL.  */
      if (token == IDENT
          && (is_reserved_word(yytext, yyleng) >= 0
              || find_symbol(yytext, SYM_TYPEDEF, 1)))
        {
          yyless(0);
          unput('(');
          lexstate = ST_NORMAL;
          token = TYPEOF_KEYW;
          break;
        }
      /* Otherwise this is an expression and the whole typeof becomes one
         phrase.  The held-back '(' has to be recorded whatever the first
         token is (an identifier, '*', a nested '(', ...), or the phrase
         text would lack its opening parenthesis.  */
      _APP("(", 1);
      lexstate = ST_TYPEOF;
      /* FALLTHRU */

    case ST_TYPEOF:
      switch (token)
        {
        case '(':
          /* The outermost '(' is not appended yet: ST_TYPEOF_1 decides
             what to do with it once the next token is known.  */
          if (++count == 1)
            lexstate = ST_TYPEOF_1;
          else
            APP;
          goto repeat;
        case ')':
          APP;
          if (--count == 0)
            {
              lexstate = ST_NORMAL;
              token = TYPEOF_PHRASE;
              break;
            }
          goto repeat;
        default:
          APP;
          goto repeat;
        }
      break;

    case ST_BRACKET:
      APP;
      switch (token)
        {
        case '[':
          ++count;
          goto repeat;
        case ']':
          if (--count == 0)
            {
              lexstate = ST_NORMAL;
              token = BRACKET_PHRASE;
              break;
            }
          goto repeat;
        default:
          goto repeat;
        }
      break;

    case ST_BRACE:
      APP;
      switch (token)
        {
        case '{':
          ++count;
          goto repeat;
        case '}':
          if (--count == 0)
            {
              lexstate = ST_NORMAL;
              token = BRACE_PHRASE;
              break;
            }
          goto repeat;
        default:
          goto repeat;
        }
      break;

    case ST_EXPRESSION:
      /* Collect tokens until a ',', ';' or '}' at nesting depth 0.  That
         terminator is not part of the expression: it is pushed back so
         that the next call returns it from ST_NORMAL.  */
      switch (token)
        {
        case '(': case '[': case '{':
          ++count;
          APP;
          goto repeat;
        case '}':
          /* is this the last line of an enum declaration? */
          if (count == 0)
            {
              /* Put back the token we just read so's we can find it again
                 after registering the expression.  */
              unput(token);

              lexstate = ST_NORMAL;
              token = EXPRESSION_PHRASE;
              break;
            }
          /* FALLTHRU */
        case ')': case ']':
          --count;
          APP;
          goto repeat;
        case ',': case ';':
          if (count == 0)
            {
              /* Put back the token we just read so's we can find it again
                 after registering the expression.  */
              unput(token);

              lexstate = ST_NORMAL;
              token = EXPRESSION_PHRASE;
              break;
            }
          APP;
          goto repeat;
        default:
          APP;
          goto repeat;
        }
      break;

    case ST_STATIC_ASSERT:
      APP;
      switch (token)
        {
        case '(':
          ++count;
          goto repeat;
        case ')':
          if (--count == 0)
            {
              lexstate = ST_NORMAL;
              token = STATIC_ASSERT_PHRASE;
              break;
            }
          goto repeat;
        default:
          goto repeat;
        }
      break;

    default:
      exit(1);
    }
fini:

  if (suppress_type_lookup > 0)
    --suppress_type_lookup;
  if (dont_want_brace_phrase > 0)
    --dont_want_brace_phrase;

  yylval = &next_node->next;

  return token;
}
441