xref: /linux/scripts/genksyms/lex.l (revision 3ccda63a3af5f12c9e0b01c06561285227d2f79c)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Lexical analysis for genksyms.
4  * Copyright 1996, 1997 Linux International.
5  *
6  * New implementation contributed by Richard Henderson <rth@tamu.edu>
7  * Based on original work by Bjorn Ekwall <bj0rn@blox.se>
8  *
9  * Taken from Linux modutils 2.4.22.
10  */
11 
12 %{
13 
14 #include <limits.h>
15 #include <stdbool.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <ctype.h>
19 
20 #include "genksyms.h"
21 #include "parse.tab.h"
22 
23 /* We've got a two-level lexer here.  We let flex do basic tokenization
24    and then we categorize those basic tokens in the second stage.  */
25 #define YY_DECL		static int yylex1(void)
26 
27 %}
28 
/* Identifiers: letters, digits and underscore, plus the dollar sign; the
   first character may not be a digit.  */
29 IDENT			[A-Za-z_\$][A-Za-z0-9_\$]*
30 
/* Integer constants: octal (which includes a lone 0), decimal or hex,
   followed by at most one suffix out of U, L, UL or LU (either case).  */
31 O_INT			0[0-7]*
32 D_INT			[1-9][0-9]*
33 X_INT			0[Xx][0-9A-Fa-f]+
34 I_SUF			[Uu]|[Ll]|[Uu][Ll]|[Ll][Uu]
35 INT			({O_INT}|{D_INT}|{X_INT}){I_SUF}?
36 
/* Floating constants: either a fraction with an optional exponent, or plain
   digits with a mandatory exponent; both take an optional F or L suffix.  */
37 FRAC			([0-9]*\.[0-9]+)|([0-9]+\.)
38 EXP			[Ee][+-]?[0-9]+
39 F_SUF			[FfLl]
40 REAL			({FRAC}{EXP}?{F_SUF}?)|([0-9]+{EXP}{F_SUF}?)
41 
/* STRING and CHAR: quoted literals with an optional L prefix, in which a
   backslash protects the character that follows it.
   MC_TOKEN: the multi-character operators, i.e. one operator character
   followed by an equals sign, or one of the two-character operators listed
   in the alternation.  */
42 STRING			L?\"([^\\\"]*\\.)*[^\\\"]*\"
43 CHAR			L?\'([^\\\']*\\.)*[^\\\']*\'
44 
45 MC_TOKEN		([~%^&*+=|<>/-]=)|(&&)|("||")|(->)|(<<)|(>>)
46 
47 /* We don't do multiple input files.  */
48 %option noyywrap
49 
50 %option noinput
51 
52 %%
53 
54 
55  /* Keep track of our location in the original source files.
     A line that starts with a hash, a number and a quoted file name is
     handed to the second-stage lexer as FILENAME; cur_line is not touched
     by that rule.  Any other line starting with a hash, and every bare
     newline, only advances cur_line.  */
56 ^#[ \t]+{INT}[ \t]+\"[^\"\n]+\".*\n	return FILENAME;
57 ^#.*\n					cur_line++;
58 \n					cur_line++;
59 
60  /* Ignore all other whitespace.  */
61 [ \t\f\v\r]+				;
62 
63 
   /* Literals and identifiers.  Every identifier-shaped word is returned as
      IDENT here; no keyword or type distinction is made at this stage.  */
64 {STRING}				return STRING;
65 {CHAR}					return CHAR;
66 {IDENT}					return IDENT;
67 
68  /* The Pedant requires that the other C multi-character tokens be
69     recognized as tokens.  We don't actually use them since we don't
70     parse expressions, but we do want whitespace to be arranged
71     around them properly.  */
72 {MC_TOKEN}				return OTHER;
73 {INT}					return INT;
74 {REAL}					return REAL;
75 
76 "..."					return DOTS;
77 
78  /* All other tokens are single characters; the character itself is the
      token code.  */
79 .					return yytext[0];
80 
81 
82 %%
83 
84 /* Bring in the keyword recognizer.  */
85 
86 #include "keywords.c"
87 
88 
89 /* Macros to append to our phrase collection list.  */
90 
91 /*
92  * We mark any token that equals a known enumerator as
93  * SYM_ENUM_CONST. The parser will change this for struct and union tags later,
94  * the only problem is struct and union members:
95  *    enum e { a, b }; struct s { int a, b; }
96  * but in this case, the only effect will be, that the ABI checksums become
97  * more volatile, which is acceptable. Also, such collisions are quite rare,
98  * so far it was only observed in include/linux/telephony.h.
99  */
/*
 * _APP(T, L): append the token text T, of length L, to the phrase list.
 *
 * Expects cur_node, next_node and in_source_file to be in scope at the
 * point of use.  next_node is the empty node at the head of the list: it
 * becomes cur_node and is filled in, and a fresh empty node is allocated
 * in front of it, so the list is linked from newest to oldest.
 *
 * The copy takes L+1 bytes from T, so the byte following the token text
 * (expected to be the terminating NUL) is copied as well.
 *
 * Both parameters are parenthesized in the expansion so that expression
 * arguments bind as the caller intended.  L is still evaluated twice, so
 * it must not have side effects.
 *
 * APP is the common case: append the text flex has just matched.
 */
100 #define _APP(T,L)	do {						   \
101 			  cur_node = next_node;				   \
102 			  next_node = xmalloc(sizeof(*next_node));	   \
103 			  next_node->next = cur_node;			   \
104 			  cur_node->string = memcpy(xmalloc((L)+1), (T), (L)+1); \
105 			  cur_node->tag =				   \
106 			    find_symbol(cur_node->string, SYM_ENUM_CONST, 1)?\
107 			    SYM_ENUM_CONST : SYM_NORMAL ;		   \
108 			  cur_node->in_source_file = in_source_file;       \
109 			} while (0)
110 
111 #define APP		_APP(yytext, yyleng)
112 
113 
114 /* The second stage lexer.  Here we incorporate knowledge of the state
115    of the parser to tailor the tokens that are returned.  */
116 
117 /*
118  * The lexer cannot distinguish whether a typedef'ed string is a TYPE or an
119  * IDENT. We need a hint from the parser to handle this accurately.
120  */
121 bool dont_want_type_specifier;
122 
/*
 * yylex - second-stage lexer.
 *
 * Raw tokens come from yylex1().  FILENAME tokens are consumed here to
 * update cur_filename, cur_line and in_source_file and are never returned.
 * Every other token is handled according to lexstate:
 *   - in ST_NORMAL tokens are returned one at a time (an IDENT may be
 *     turned into a keyword token or TYPE), unless the token opens a
 *     phrase, in which case the state changes and reading continues;
 *   - in the other states tokens are read until the closing delimiter of
 *     the phrase (for ST_EXPRESSION: an unnested ',', ';' or '}'), and a
 *     single *_PHRASE token is returned for the whole run.
 * Token text is appended to the string list with APP/_APP as it is
 * consumed.
 *
 * lexstate, suppress_type_lookup, dont_want_brace_phrase, next_node and
 * source_file are static and persist between calls.  count is automatic
 * and starts at 0 on every call, so nesting depth is only tracked within
 * one call.
 *
 * Returns 0 when yylex1() returns 0; otherwise returns the token code after
 * setting yylval to &next_node->next.  Calls exit(1) if lexstate holds a
 * value no case handles.
 */
123 int
124 yylex(void)
125 {
  /* What kind of token run is being collected; kept across calls.  */
126   static enum {
127     ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1,
128     ST_BRACKET, ST_BRACE, ST_EXPRESSION, ST_STATIC_ASSERT,
129   } lexstate = ST_NOTSTARTED;
130 
  /* Countdowns, decremented each time control reaches fini.  */
131   static int suppress_type_lookup, dont_want_brace_phrase;
  /* Empty node in front of the list; _APP fills it and allocates a new one.  */
132   static struct string_list *next_node;
  /* Copy of the file name from the first FILENAME token seen.  */
133   static char *source_file;
134 
135   int token, count = 0;
136   struct string_list *cur_node;
137 
  /* First call only: create the initial empty list node.  */
138   if (lexstate == ST_NOTSTARTED)
139     {
140       next_node = xmalloc(sizeof(*next_node));
141       next_node->next = NULL;
142       lexstate = ST_NORMAL;
143     }
144 
145 repeat:
146   token = yylex1();
147 
148   if (token == 0)
149     return 0;
150   else if (token == FILENAME)
151     {
152       char *file, *e;
153 
154       /* Save the filename and line number for later error messages.  */
155 
156       if (cur_filename)
157 	free(cur_filename);
158 
      /* file points just past the first double quote in the matched text;
	 e is the next double quote, which is overwritten with NUL, so
	 yytext is modified in place.  */
159       file = strchr(yytext, '\"')+1;
160       e = strchr(file, '\"');
161       *e = '\0';
162       cur_filename = memcpy(xmalloc(e-file+1), file, e-file+1);
      /* Skip the '#' and the first blank; atoi parses the number after.  */
163       cur_line = atoi(yytext+2);
164 
      /* The first file name seen is remembered as the source file; after
	 that in_source_file says whether the current file is that one.  */
165       if (!source_file) {
166         source_file = xstrdup(cur_filename);
167         in_source_file = 1;
168       } else {
169         in_source_file = (strcmp(cur_filename, source_file) == 0);
170       }
171 
172       goto repeat;
173     }
174 
175   switch (lexstate)
176     {
177     case ST_NORMAL:
178       switch (token)
179 	{
180 	case IDENT:
	  /* The text is appended before classification, so a keyword that
	     opens a phrase is itself part of the recorded text.  */
181 	  APP;
182 	  {
183 	    int r = is_reserved_word(yytext, yyleng);
184 	    if (r >= 0)
185 	      {
186 		switch (token = r)
187 		  {
188 		  case ATTRIBUTE_KEYW:
189 		    lexstate = ST_ATTRIBUTE;
190 		    count = 0;
191 		    goto repeat;
192 		  case ASM_KEYW:
193 		    lexstate = ST_ASM;
194 		    count = 0;
195 		    goto repeat;
196 		  case TYPEOF_KEYW:
197 		    lexstate = ST_TYPEOF;
198 		    count = 0;
199 		    goto repeat;
200 
		  /* Both counters lose one at fini for the keyword itself.
		     Net effect: a '{' among the next two returned tokens is
		     returned as a plain '{' rather than collected as a
		     BRACE_PHRASE, and the next token, if an IDENT, is not
		     looked up as a typedef.  */
201 		  case STRUCT_KEYW:
202 		  case UNION_KEYW:
203 		  case ENUM_KEYW:
204 		    dont_want_brace_phrase = 3;
205 		    suppress_type_lookup = 2;
206 		    goto fini;
207 
208 		  case EXPORT_SYMBOL_KEYW:
209 		      goto fini;
210 
211 		  case STATIC_ASSERT_KEYW:
212 		    lexstate = ST_STATIC_ASSERT;
213 		    count = 0;
214 		    goto repeat;
215 		  }
216 	      }
	    /* Reached for plain identifiers and for keywords without a case
	       above: a known typedef name is reported as TYPE, unless the
	       lookup is suppressed here or by the parser's hint.  */
217 	    if (!suppress_type_lookup && !dont_want_type_specifier)
218 	      {
219 		if (find_symbol(yytext, SYM_TYPEDEF, 1))
220 		  token = TYPE;
221 	      }
222 	  }
223 	  break;
224 
225 	case '[':
226 	  APP;
227 	  lexstate = ST_BRACKET;
228 	  count = 1;
229 	  goto repeat;
230 
231 	case '{':
232 	  APP;
	  /* Shortly after struct/union/enum the brace is returned as-is.  */
233 	  if (dont_want_brace_phrase)
234 	    break;
235 	  lexstate = ST_BRACE;
236 	  count = 1;
237 	  goto repeat;
238 
	/* The '=' or ':' itself is returned now; the next call starts in
	   ST_EXPRESSION with count back at 0.  */
239 	case '=': case ':':
240 	  APP;
241 	  lexstate = ST_EXPRESSION;
242 	  break;
243 
244 	default:
245 	  APP;
246 	  break;
247 	}
248       break;
249 
    /* Collect tokens until a ')' brings the parenthesis count to zero,
       then return the run as ATTRIBUTE_PHRASE.  */
250     case ST_ATTRIBUTE:
251       APP;
252       switch (token)
253 	{
254 	case '(':
255 	  ++count;
256 	  goto repeat;
257 	case ')':
258 	  if (--count == 0)
259 	    {
260 	      lexstate = ST_NORMAL;
261 	      token = ATTRIBUTE_PHRASE;
262 	      break;
263 	    }
264 	  goto repeat;
265 	default:
266 	  goto repeat;
267 	}
268       break;
269 
    /* Same scheme as ST_ATTRIBUTE; the result is ASM_PHRASE.  */
270     case ST_ASM:
271       APP;
272       switch (token)
273 	{
274 	case '(':
275 	  ++count;
276 	  goto repeat;
277 	case ')':
278 	  if (--count == 0)
279 	    {
280 	      lexstate = ST_NORMAL;
281 	      token = ASM_PHRASE;
282 	      break;
283 	    }
284 	  goto repeat;
285 	default:
286 	  goto repeat;
287 	}
288       break;
289 
    /* Entered after the first '(' following typeof, which has been counted
       but not appended.  If the next token is an IDENT naming a reserved
       word or a typedef, both the identifier (yyless(0)) and a '(' are
       pushed back and TYPEOF_KEYW is returned, so those tokens are lexed
       again in ST_NORMAL.  Otherwise collection continues in ST_TYPEOF.
       NOTE(review): the held-back "(" is appended only when the token is an
       IDENT; for any other token it is never appended.  Confirm whether
       that is intended.  */
290     case ST_TYPEOF_1:
291       if (token == IDENT)
292 	{
293 	  if (is_reserved_word(yytext, yyleng) >= 0
294 	      || find_symbol(yytext, SYM_TYPEDEF, 1))
295 	    {
296 	      yyless(0);
297 	      unput('(');
298 	      lexstate = ST_NORMAL;
299 	      token = TYPEOF_KEYW;
300 	      break;
301 	    }
302 	  _APP("(", 1);
303 	}
304 	lexstate = ST_TYPEOF;
305 	/* FALLTHRU */
306 
    /* Collect through the ')' that brings the count to zero and return
       TYPEOF_PHRASE.  The first '(' is held back and ST_TYPEOF_1 takes
       over for the token after it.  */
307     case ST_TYPEOF:
308       switch (token)
309 	{
310 	case '(':
311 	  if ( ++count == 1 )
312 	    lexstate = ST_TYPEOF_1;
313 	  else
314 	    APP;
315 	  goto repeat;
316 	case ')':
317 	  APP;
318 	  if (--count == 0)
319 	    {
320 	      lexstate = ST_NORMAL;
321 	      token = TYPEOF_PHRASE;
322 	      break;
323 	    }
324 	  goto repeat;
325 	default:
326 	  APP;
327 	  goto repeat;
328 	}
329       break;
330 
    /* Entered with count == 1 for the '[' already seen in ST_NORMAL;
       returns BRACKET_PHRASE at the matching ']'.  */
331     case ST_BRACKET:
332       APP;
333       switch (token)
334 	{
335 	case '[':
336 	  ++count;
337 	  goto repeat;
338 	case ']':
339 	  if (--count == 0)
340 	    {
341 	      lexstate = ST_NORMAL;
342 	      token = BRACKET_PHRASE;
343 	      break;
344 	    }
345 	  goto repeat;
346 	default:
347 	  goto repeat;
348 	}
349       break;
350 
    /* Entered with count == 1 for the '{' already seen in ST_NORMAL;
       returns BRACE_PHRASE at the matching '}'.  */
351     case ST_BRACE:
352       APP;
353       switch (token)
354 	{
355 	case '{':
356 	  ++count;
357 	  goto repeat;
358 	case '}':
359 	  if (--count == 0)
360 	    {
361 	      lexstate = ST_NORMAL;
362 	      token = BRACE_PHRASE;
363 	      break;
364 	    }
365 	  goto repeat;
366 	default:
367 	  goto repeat;
368 	}
369       break;
370 
    /* Collect an expression.  Opening delimiters raise count and closing
       ones lower it.  A ',', ';' or '}' seen while count is 0 ends the
       expression: it is pushed back, not appended, and EXPRESSION_PHRASE
       is returned.  */
371     case ST_EXPRESSION:
372       switch (token)
373 	{
374 	case '(': case '[': case '{':
375 	  ++count;
376 	  APP;
377 	  goto repeat;
378 	case '}':
379 	  /* is this the last line of an enum declaration? */
380 	  if (count == 0)
381 	    {
382 	      /* Put back the token we just read so's we can find it again
383 		 after registering the expression.  */
384 	      unput(token);
385 
386 	      lexstate = ST_NORMAL;
387 	      token = EXPRESSION_PHRASE;
388 	      break;
389 	    }
390 	  /* FALLTHRU */
391 	case ')': case ']':
392 	  --count;
393 	  APP;
394 	  goto repeat;
395 	case ',': case ';':
396 	  if (count == 0)
397 	    {
398 	      /* Put back the token we just read so's we can find it again
399 		 after registering the expression.  */
400 	      unput(token);
401 
402 	      lexstate = ST_NORMAL;
403 	      token = EXPRESSION_PHRASE;
404 	      break;
405 	    }
406 	  APP;
407 	  goto repeat;
408 	default:
409 	  APP;
410 	  goto repeat;
411 	}
412       break;
413 
    /* Same scheme as ST_ATTRIBUTE; the result is STATIC_ASSERT_PHRASE.  */
414     case ST_STATIC_ASSERT:
415       APP;
416       switch (token)
417 	{
418 	case '(':
419 	  ++count;
420 	  goto repeat;
421 	case ')':
422 	  if (--count == 0)
423 	    {
424 	      lexstate = ST_NORMAL;
425 	      token = STATIC_ASSERT_PHRASE;
426 	      break;
427 	    }
428 	  goto repeat;
429 	default:
430 	  goto repeat;
431 	}
432       break;
433 
    /* A lexstate value with no case above.  */
434     default:
435       exit(1);
436     }
437 fini:
438 
  /* One returned token has gone by: step both countdowns toward zero.  */
439   if (suppress_type_lookup > 0)
440     --suppress_type_lookup;
441   if (dont_want_brace_phrase > 0)
442     --dont_want_brace_phrase;
443 
  /* next_node is the empty front node, so its next field points at the
     most recently appended node.  */
444   yylval = &next_node->next;
445 
446   return token;
447 }
448