xref: /illumos-gate/usr/src/tools/ndrgen/ndr_lex.c (revision 56b56c0dc63eac41299ada6dcb890406f9063b1c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <errno.h>
28 #include <stdarg.h>
29 #include "ndrgen.h"
30 #include "y.tab.h"
31 
32 /*
33  * C-like lexical analysis.
34  *
35  * 1. Define a "struct node"
36  * 2. Define a "struct symbol" that encapsulates a struct node.
37  * 3. Define a "struct integer" that encapsulates a struct node.
38  * 4. Set the YACC stack type in the grammar:
39  *		%{
40  *		#define YYSTYPE struct node *
41  *		%}
42  * 5. Define %token's in the grammar for IDENTIFIER, STRING and INTEGER.
43  *    Using "_KW" as a suffix for keyword tokens, i.e. "struct" is
44  *    "%token STRUCT_KW":
45  *	// atomic values
46  *	%token INTEGER STRING IDENTIFIER
47  *	// keywords
48  *	%token STRUCT_KW CASE_KW
49  *	// operators
50  *	%token PLUS MINUS ASSIGN ARROW
51  *	// overloaded tokens (++ --, < > <= >=, == !=, += -= *= ...)
52  *	%token INCOP RELOP EQUOP ASSOP
53  * 6. It's easiest to use the yacc(1) generated token numbers for node
54  *    labels.  For node labels that are not actually part of the grammar,
55  *    use a %token with an L_ prefix:
56  *	// node labels (can't be generated by lex)
57  *	%token L_LT L_LTE L_GT L_GTE L_EQU L_NEQ
58  * 7. Call set_lex_input() before parsing.
59  */
60 
/*
 * Character classification helpers.
 *
 * These are function-like macros: each one may evaluate its argument
 * more than once, so never pass an expression with side effects.
 * Every use of a parameter is parenthesized so that an arbitrary
 * expression can be passed safely.
 */
#define	SQ	'\''
#define	DQ	'"'

/* Used by str_to_sv(): quote characters and word separators. */
#define	isquote(c) ((c) == SQ || (c) == DQ)
#define	iswhite(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\f')

/* Used by yylex().  Note that is_white() deliberately excludes '\n'. */
#define	is_between(c, l, u)  ((l) <= (c) && (c) <= (u))
#define	is_white(c)	\
	((c) == ' ' || (c) == '\r' || (c) == '\t' || (c) == '\f')
#define	is_lower(c)	is_between((c), 'a', 'z')
#define	is_upper(c)	is_between((c), 'A', 'Z')
#define	is_alpha(c)	(is_lower(c) || is_upper(c))
#define	is_digit(c)	is_between((c), '0', '9')
/* First character of an identifier, and the characters that may follow. */
#define	is_sstart(c)	(is_alpha(c) || (c) == '_')
#define	is_sfollow(c)	(is_sstart(c) || is_digit(c))
#define	is_xdigit(c)	\
	(is_digit(c) || is_between((c), 'A', 'F') || is_between((c), 'a', 'f'))
77 
78 ndr_symbol_t		*symbol_list;
79 static ndr_integer_t	*integer_list;
80 static FILE		*lex_infp;
81 static ndr_symbol_t	*file_name;
82 int			line_number;
83 int			n_compile_error;
84 
85 static int		lex_at_bol;
86 
87 /* In yacc(1) generated parser */
88 extern struct node	*yylval;
89 
90 /*
91  * The keywtab[] and optable[] could be external to this lex
92  * and it would all still work.
93  */
94 static ndr_keyword_t keywtable[] = {
95 	{ "struct",	STRUCT_KW,	0 },
96 	{ "union",	UNION_KW,	0 },
97 	{ "typedef",	TYPEDEF_KW,	0 },
98 
99 	{ "interface",	INTERFACE_KW,	0 },
100 	{ "uuid",	UUID_KW,	0 },
101 	{ "_no_reorder", _NO_REORDER_KW, 0 },
102 	{ "extern",	EXTERN_KW,	0 },
103 	{ "reference",	REFERENCE_KW,	0 },
104 
105 	{ "align",	ALIGN_KW,	0 },
106 	{ "operation",	OPERATION_KW,	0 },
107 	{ "in",		IN_KW,		0 },
108 	{ "out",	OUT_KW,		0 },
109 
110 	{ "string",	STRING_KW,	0 },
111 	{ "size_is",	SIZE_IS_KW,	0 },
112 	{ "length_is",	LENGTH_IS_KW,	0 },
113 
114 	{ "switch_is",	SWITCH_IS_KW,	0 },
115 	{ "case",	CASE_KW,	0 },
116 	{ "default",	DEFAULT_KW,	0 },
117 
118 	{ "transmit_as", TRANSMIT_AS_KW, 0 },
119 	{ "arg_is",	ARG_IS_KW,	0 },
120 
121 	{ "char",	BASIC_TYPE,	1 },
122 	{ "uchar",	BASIC_TYPE,	1 },
123 	{ "wchar",	BASIC_TYPE,	2 },
124 	{ "short",	BASIC_TYPE,	2 },
125 	{ "ushort",	BASIC_TYPE,	2 },
126 	{ "long",	BASIC_TYPE,	4 },
127 	{ "ulong",	BASIC_TYPE,	4 },
128 	{0}
129 };
130 
131 static ndr_keyword_t optable[] = {
132 	{ "{",		LC,		0 },
133 	{ "}",		RC,		0 },
134 	{ "(",		LP,		0 },
135 	{ ")",		RP,		0 },
136 	{ "[",		LB,		0 },
137 	{ "]",		RB,		0 },
138 	{ "*",		STAR,		0 },
139 	{ "/",		DIV,		0 },
140 	{ "%",		MOD,		0 },
141 	{ "-",		MINUS,		0 },
142 	{ "+",		PLUS,		0 },
143 	{ "&",		AND,		0 },
144 	{ "|",		OR,		0 },
145 	{ "^",		XOR,		0 },
146 	{ ";",		SEMI,		0 },
147 	{0}
148 };
149 
150 static int getch(FILE *fp);
151 static ndr_integer_t *int_enter(long);
152 static ndr_symbol_t *sym_enter(char *);
153 static ndr_symbol_t *sym_find(char *);
154 static int str_to_sv(char *, char *sv[]);
155 
156 /*
157  * Enter the symbols for keyword.
158  */
159 static void
160 keyw_tab_init(ndr_keyword_t kwtable[])
161 {
162 	int			i;
163 	ndr_keyword_t		*kw;
164 	ndr_symbol_t		*sym;
165 
166 	for (i = 0; kwtable[i].name; i++) {
167 		kw = &kwtable[i];
168 
169 		sym = sym_enter(kw->name);
170 		sym->kw = kw;
171 	}
172 }
173 
174 void
175 set_lex_input(FILE *fp, char *name)
176 {
177 	keyw_tab_init(keywtable);
178 	keyw_tab_init(optable);
179 
180 	lex_infp = fp;
181 	file_name = sym_enter(name);
182 	line_number = 1;
183 	lex_at_bol = 1;
184 }
185 
/*
 * Read one character from fp.  Returns whatever getc(3C) returns,
 * which is EOF at end of input.  All lexer input goes through here.
 */
static int
getch(FILE *fp)
{
	int	ch;

	ch = getc(fp);

	return (ch);
}
191 
192 int
193 yylex(void)
194 {
195 	char		lexeme[512];
196 	char		*p = lexeme;
197 	FILE		*fp = lex_infp;
198 	int		c, xc;
199 	ndr_symbol_t	*sym;
200 	ndr_integer_t	*intg;
201 
202 top:
203 	p = lexeme;
204 
205 	c = getch(fp);
206 	if (c == EOF)
207 		return (EOF);
208 
209 	if (c == '\n') {
210 		line_number++;
211 		lex_at_bol = 1;
212 		goto top;
213 	}
214 
215 	/*
216 	 * Handle preprocessor lines. This just notes
217 	 * which file we're processing.
218 	 */
219 	if (c == '#' && lex_at_bol) {
220 		char		*sv[10];
221 		int		sc;
222 
223 		while ((c = getch(fp)) != EOF && c != '\n')
224 			*p++ = c;
225 
226 		*p = 0;
227 		/* note: no ungetc() of newline, we don't want to count it */
228 
229 		if (*lexeme != ' ') {
230 			/* not a line we know */
231 			goto top;
232 		}
233 
234 		sc = str_to_sv(lexeme, sv);
235 		if (sc < 2)
236 			goto top;
237 
238 		file_name = sym_enter(sv[1]);
239 		line_number = atoi(sv[0]);	/* for next input line */
240 		lex_at_bol = 1;
241 		goto top;
242 	}
243 
244 	lex_at_bol = 0;
245 
246 	/*
247 	 * Skip white space
248 	 */
249 	if (is_white(c))
250 		goto top;
251 
252 	/*
253 	 * Symbol? Might be a keyword or just an identifier
254 	 */
255 	if (is_sstart(c)) {
256 		/* we got a symbol */
257 		do {
258 			*p++ = c;
259 			c = getch(fp);
260 		} while (is_sfollow(c));
261 		(void) ungetc(c, fp);
262 		*p = 0;
263 
264 		sym = sym_enter(lexeme);
265 
266 		yylval = &sym->s_node;
267 
268 		if (sym->kw) {
269 			return (sym->kw->token);
270 		} else {
271 			return (IDENTIFIER);
272 		}
273 	}
274 
275 	/*
276 	 * Integer constant?
277 	 */
278 	if (is_digit(c)) {
279 		/* we got a number */
280 		*p++ = c;
281 		if (c == '0') {
282 			c = getch(fp);
283 			if (c == 'x' || c == 'X') {
284 				/* handle hex specially */
285 				do {
286 					*p++ = c;
287 					c = getch(fp);
288 				} while (is_xdigit(c));
289 				goto convert_icon;
290 			} else if (c == 'b' || c == 'B' ||
291 			    c == 'd' || c == 'D' ||
292 			    c == 'o' || c == 'O') {
293 				do {
294 					*p++ = c;
295 					c = getch(fp);
296 				} while (is_digit(c));
297 				goto convert_icon;
298 			}
299 			(void) ungetc(c, fp);
300 		}
301 		/* could be anything */
302 		c = getch(fp);
303 		while (is_digit(c)) {
304 			*p++ = c;
305 			c = getch(fp);
306 		}
307 
308 convert_icon:
309 		*p = 0;
310 		(void) ungetc(c, fp);
311 
312 		intg = int_enter(strtol(lexeme, 0, 0));
313 		yylval = &intg->s_node;
314 
315 		return (INTEGER);
316 	}
317 
318 	/* Could handle strings. We don't seem to need them yet */
319 
320 	yylval = 0;		/* operator tokens have no value */
321 	xc = getch(fp);		/* get look-ahead for two-char lexemes */
322 
323 	lexeme[0] = c;
324 	lexeme[1] = xc;
325 	lexeme[2] = 0;
326 
327 	/*
328 	 * Look for to-end-of-line comment
329 	 */
330 	if (c == '/' && xc == '/') {
331 		/* eat the comment */
332 		while ((c = getch(fp)) != EOF && c != '\n')
333 			;
334 		(void) ungetc(c, fp);		/* put back newline */
335 		goto top;
336 	}
337 
338 	/*
339 	 * Look for multi-line comment
340 	 */
341 	if (c == '/' && xc == '*') {
342 		/* eat the comment */
343 		xc = -1;
344 		while ((c = getch(fp)) != EOF) {
345 			if (xc == '*' && c == '/') {
346 				/* that's it */
347 				break;
348 			}
349 			xc = c;
350 			if (c == '\n')
351 				line_number++;
352 		}
353 		goto top;
354 	}
355 
356 	/*
357 	 * Use symbol table lookup for two-character and
358 	 * one character operator tokens.
359 	 */
360 	sym = sym_find(lexeme);
361 	if (sym) {
362 		/* there better be a keyword attached */
363 		yylval = &sym->s_node;
364 		return (sym->kw->token);
365 	}
366 
367 	/* Try a one-character form */
368 	(void) ungetc(xc, fp);
369 	lexeme[1] = 0;
370 	sym = sym_find(lexeme);
371 	if (sym) {
372 		/* there better be a keyword attached */
373 		yylval = &sym->s_node;
374 		return (sym->kw->token);
375 	}
376 
377 	if (is_between(c, ' ', '~'))
378 		compile_error("unrecognized character: 0x%02x (%c)", c, c);
379 	else
380 		compile_error("unrecognized character: 0x%02x", c);
381 	goto top;
382 }
383 
384 static ndr_symbol_t *
385 sym_find(char *name)
386 {
387 	ndr_symbol_t		**pp;
388 	ndr_symbol_t		*p;
389 
390 	for (pp = &symbol_list; (p = *pp) != 0; pp = &p->next) {
391 		if (strcmp(p->name, name) == 0)
392 			return (p);
393 	}
394 
395 	return (0);
396 }
397 
398 static ndr_symbol_t *
399 sym_enter(char *name)
400 {
401 	ndr_symbol_t		**pp;
402 	ndr_symbol_t		*p;
403 
404 	for (pp = &symbol_list; (p = *pp) != 0; pp = &p->next) {
405 		if (strcmp(p->name, name) == 0)
406 			return (p);
407 	}
408 
409 	p = ndr_alloc(1, sizeof (ndr_symbol_t));
410 
411 	if ((p->name = strdup(name)) == NULL)
412 		fatal_error("%s", strerror(ENOMEM));
413 
414 	p->s_node.label = IDENTIFIER;
415 	p->s_node.n_sym = p;
416 
417 	*pp = p;
418 
419 	return (p);
420 }
421 
422 static ndr_integer_t *
423 int_enter(long value)
424 {
425 	ndr_integer_t		**pp;
426 	ndr_integer_t		*p;
427 
428 	for (pp = &integer_list; (p = *pp) != 0; pp = &p->next) {
429 		if (p->value == value)
430 			return (p);
431 	}
432 
433 	p = ndr_alloc(1, sizeof (ndr_integer_t));
434 
435 	p->value = value;
436 	p->s_node.label = INTEGER;
437 	p->s_node.n_int = value;
438 
439 	*pp = p;
440 
441 	return (p);
442 }
443 
/*
 * Allocate zero-filled memory for nelem objects of elsize bytes each.
 * Failure is reported through fatal_error(), which terminates the
 * program, so callers do not check the result.
 */
void *
ndr_alloc(size_t nelem, size_t elsize)
{
	void *mem = calloc(nelem, elsize);

	if (mem != NULL)
		return (mem);

	fatal_error("%s", strerror(ENOMEM));
	/* NOTREACHED */
	return (NULL);
}
456 
457 /*
458  * The input context (filename, line number) is maintained by the
459  * lexical analysis, and we generally want such info reported for
460  * errors in a consistent manner.
461  */
462 void
463 compile_error(const char *fmt, ...)
464 {
465 	char	buf[NDLBUFSZ];
466 	va_list ap;
467 
468 	va_start(ap, fmt);
469 	(void) vsnprintf(buf, NDLBUFSZ, fmt, ap);
470 	va_end(ap);
471 
472 	(void) fprintf(stderr, "ndrgen: compile error: %s:%d: %s\n",
473 	    file_name->name, line_number, buf);
474 
475 	n_compile_error++;
476 }
477 
478 void
479 fatal_error(const char *fmt, ...)
480 {
481 	char	buf[NDLBUFSZ];
482 	va_list ap;
483 
484 	va_start(ap, fmt);
485 	(void) vsnprintf(buf, NDLBUFSZ, fmt, ap);
486 	va_end(ap);
487 
488 	(void) fprintf(stderr, "ndrgen: fatal error: %s\n", buf);
489 	exit(1);
490 }
491 
492 /*
493  * Setup nodes for the lexical analyzer.
494  */
495 struct node *
496 n_cons(int label, ...)
497 {
498 	ndr_node_t		*np;
499 	va_list ap;
500 
501 	np = ndr_alloc(1, sizeof (ndr_node_t));
502 
503 	va_start(ap, label);
504 	np->label = label;
505 	np->n_arg[0] = va_arg(ap, void *);
506 	np->n_arg[1] = va_arg(ap, void *);
507 	np->n_arg[2] = va_arg(ap, void *);
508 	va_end(ap);
509 
510 	np->line_number = line_number;
511 	np->file_name = file_name;
512 
513 	return (np);
514 }
515 
516 /*
517  *	list:	item
518  *	|	list item	={ n_splice($1, $2); }
519  *	;
520  */
521 void
522 n_splice(struct node *np1, struct node *np2)
523 {
524 	while (np1->n_next)
525 		np1 = np1->n_next;
526 
527 	np1->n_next = np2;
528 }
529 
/*
 * Split buf into words, in place, and store a pointer to each word in
 * sv[], followed by a null pointer.  Returns the number of words.
 *
 * Words are separated by blanks, tabs, newlines or form feeds.  Text
 * between a pair of matching single or double quotes is copied as is,
 * separators included, and the quotes themselves are removed.  An
 * unterminated quote runs to the end of the string.
 *
 * buf is overwritten with the NUL-terminated words.  Nothing limits
 * the number of pointers stored, so sv[] must have room for one more
 * entry than buf can contain words.
 */
static int
str_to_sv(char *buf, char *sv[])
{
	char		**wordp = sv;	/* next free slot in sv[] */
	char		*src = buf;	/* read position */
	char		*dst = buf;	/* write position, never ahead of src */
	int		in_word = 0;
	int		c;

	while ((c = *src++) != 0) {
		if (!in_word) {
			if (iswhite(c))
				continue;

			/* first character of a new word */
			*wordp++ = dst;
			in_word = 1;
		}

		if (isquote(c)) {
			int		closer = c;

			/* copy up to, but not including, the closing quote */
			while (((c = *src++) != 0) && (c != closer))
				*dst++ = c;
			if (c == 0)
				break;
			continue;
		}

		if (iswhite(c)) {
			/* end of word */
			*dst++ = 0;
			in_word = 0;
		} else {
			/* still inside word */
			*dst++ = c;
		}
	}

	/* terminate a word that ran to the end of the string */
	if (in_word)
		*dst++ = 0;

	*wordp = (char *)0;
	return (wordp - sv);
}
578 }
579