xref: /illumos-gate/usr/src/tools/ndrgen/ndr_lex.c (revision 63f91fbc3c024870d86dc3332a4a0080fb29bc40)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
29  */
30 
31 #include <errno.h>
32 #include <stdarg.h>
33 #include "ndrgen.h"
34 #include "y.tab.h"
35 
/*
 * C-like lexical analysis.
 *
 * 1. Define a "struct node"
 * 2. Define a "struct symbol" that encapsulates a struct node.
 * 3. Define a "struct integer" that encapsulates a struct node.
 * 4. Set the YACC stack type in the grammar:
 *		%{
 *		#define YYSTYPE struct node *
 *		%}
 * 5. Define %token's in the grammar for IDENTIFIER, STRING and INTEGER.
 *    Use "_KW" as a suffix for keyword tokens, e.g. "struct" is
 *    "%token STRUCT_KW":
 *	// atomic values
 *	%token INTEGER STRING IDENTIFIER
 *	// keywords
 *	%token STRUCT_KW CASE_KW
 *	// operators
 *	%token PLUS MINUS ASSIGN ARROW
 *	// overloaded tokens (++ --, < > <= >=, == !=, += -= *= ...)
 *	%token INCOP RELOP EQUOP ASSOP
 * 6. It's easiest to use the yacc(1) generated token numbers for node
 *    labels.  For node labels that are not actually part of the grammar,
 *    use a %token with an L_ prefix:
 *	// node labels (can't be generated by lex)
 *	%token L_LT L_LTE L_GT L_GTE L_EQU L_NEQ
 * 7. Call set_lex_input() before parsing.
 */
64 
/*
 * Character classification helpers.
 *
 * These are deliberately ASCII-only and locale independent.  Every macro
 * evaluates its argument more than once, so never pass an expression
 * with side effects (e.g. getch(fp)); read into a variable first.
 */
#define	SQ	'\''
#define	DQ	'"'

/* Used when splitting a preprocessor line into words. */
#define	isquote(c) ((c) == SQ || (c) == DQ)
#define	iswhite(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\f')

#define	is_between(c, l, u)  ((l) <= (c) && (c) <= (u))

/*
 * Blank space between tokens.  Newline is intentionally absent: the
 * lexer handles it separately in order to count lines.  Every use of
 * the argument is parenthesized so that an expression argument (for
 * instance a conditional expression) is classified correctly.
 */
#define	is_white(c)	\
	((c) == ' ' || (c) == '\r' || (c) == '\t' || (c) == '\f')

#define	is_lower(c)	is_between((c), 'a', 'z')
#define	is_upper(c)	is_between((c), 'A', 'Z')
#define	is_alpha(c)	(is_lower(c) || is_upper(c))
#define	is_digit(c)	is_between((c), '0', '9')

/* First character of a symbol, and any character after the first. */
#define	is_sstart(c)	(is_alpha(c) || (c) == '_')
#define	is_sfollow(c)	(is_sstart(c) || is_digit(c))

#define	is_xdigit(c)	\
	(is_digit(c) || is_between((c), 'A', 'F') || is_between((c), 'a', 'f'))
81 
82 ndr_symbol_t		*symbol_list;
83 static ndr_integer_t	*integer_list;
84 static FILE		*lex_infp;
85 static ndr_symbol_t	*file_name;
86 int			line_number;
87 int			n_compile_error;
88 
89 static int		lex_at_bol;
90 
91 /* In yacc(1) generated parser */
92 extern struct node	*yylval;
93 
94 /*
95  * The keywtab[] and optable[] could be external to this lex
96  * and it would all still work.
97  */
98 static ndr_keyword_t keywtable[] = {
99 	{ "struct",	STRUCT_KW,	0 },
100 	{ "union",	UNION_KW,	0 },
101 	{ "typedef",	TYPEDEF_KW,	0 },
102 
103 	{ "interface",	INTERFACE_KW,	0 },
104 	{ "uuid",	UUID_KW,	0 },
105 	{ "_no_reorder", _NO_REORDER_KW, 0 },
106 	{ "extern",	EXTERN_KW,	0 },
107 	{ "reference",	REFERENCE_KW,	0 },
108 
109 	{ "align",	ALIGN_KW,	0 },
110 	{ "operation",	OPERATION_KW,	0 },
111 	{ "in",		IN_KW,		0 },
112 	{ "out",	OUT_KW,		0 },
113 
114 	{ "string",	STRING_KW,	0 },
115 	{ "size_is",	SIZE_IS_KW,	0 },
116 	{ "length_is",	LENGTH_IS_KW,	0 },
117 
118 	{ "switch_is",	SWITCH_IS_KW,	0 },
119 	{ "case",	CASE_KW,	0 },
120 	{ "default",	DEFAULT_KW,	0 },
121 
122 	{ "transmit_as", TRANSMIT_AS_KW, 0 },
123 	{ "arg_is",	ARG_IS_KW,	0 },
124 	{ "fake",	FAKE_KW,	0 },
125 
126 	{ "char",	BASIC_TYPE,	1 },
127 	{ "uchar",	BASIC_TYPE,	1 },
128 	{ "wchar",	BASIC_TYPE,	2 },
129 	{ "short",	BASIC_TYPE,	2 },
130 	{ "ushort",	BASIC_TYPE,	2 },
131 	{ "long",	BASIC_TYPE,	4 },
132 	{ "ulong",	BASIC_TYPE,	4 },
133 	{0}
134 };
135 
136 static ndr_keyword_t optable[] = {
137 	{ "{",		LC,		0 },
138 	{ "}",		RC,		0 },
139 	{ "(",		LP,		0 },
140 	{ ")",		RP,		0 },
141 	{ "[",		LB,		0 },
142 	{ "]",		RB,		0 },
143 	{ "*",		STAR,		0 },
144 	{ "/",		DIV,		0 },
145 	{ "%",		MOD,		0 },
146 	{ "-",		MINUS,		0 },
147 	{ "+",		PLUS,		0 },
148 	{ "&",		AND,		0 },
149 	{ "|",		OR,		0 },
150 	{ "^",		XOR,		0 },
151 	{ ";",		SEMI,		0 },
152 	{0}
153 };
154 
155 static int getch(FILE *fp);
156 static ndr_integer_t *int_enter(long);
157 static ndr_symbol_t *sym_enter(char *);
158 static ndr_symbol_t *sym_find(char *);
159 static int str_to_sv(char *, char *sv[]);
160 
161 /*
162  * Enter the symbols for keyword.
163  */
164 static void
165 keyw_tab_init(ndr_keyword_t kwtable[])
166 {
167 	int			i;
168 	ndr_keyword_t		*kw;
169 	ndr_symbol_t		*sym;
170 
171 	for (i = 0; kwtable[i].name; i++) {
172 		kw = &kwtable[i];
173 
174 		sym = sym_enter(kw->name);
175 		sym->kw = kw;
176 	}
177 }
178 
179 void
180 set_lex_input(FILE *fp, char *name)
181 {
182 	keyw_tab_init(keywtable);
183 	keyw_tab_init(optable);
184 
185 	lex_infp = fp;
186 	file_name = sym_enter(name);
187 	line_number = 1;
188 	lex_at_bol = 1;
189 }
190 
/*
 * Read the next character from fp.  Returns EOF at end of input.
 * All lexer input goes through here.
 */
static int
getch(FILE *fp)
{
	int	ch = getc(fp);

	return (ch);
}
196 
197 int
198 yylex(void)
199 {
200 	char		lexeme[512];
201 	char		*p = lexeme;
202 	FILE		*fp = lex_infp;
203 	int		c, xc;
204 	ndr_symbol_t	*sym;
205 	ndr_integer_t	*intg;
206 
207 top:
208 	p = lexeme;
209 
210 	c = getch(fp);
211 	if (c == EOF)
212 		return (EOF);
213 
214 	if (c == '\n') {
215 		line_number++;
216 		lex_at_bol = 1;
217 		goto top;
218 	}
219 
220 	/*
221 	 * Handle preprocessor lines. This just notes
222 	 * which file we're processing.
223 	 */
224 	if (c == '#' && lex_at_bol) {
225 		char		*sv[10];
226 		int		sc;
227 
228 		while ((c = getch(fp)) != EOF && c != '\n')
229 			*p++ = c;
230 
231 		*p = 0;
232 		/* note: no ungetc() of newline, we don't want to count it */
233 
234 		if (*lexeme != ' ') {
235 			/* not a line we know */
236 			goto top;
237 		}
238 
239 		sc = str_to_sv(lexeme, sv);
240 		if (sc < 2)
241 			goto top;
242 
243 		file_name = sym_enter(sv[1]);
244 		line_number = atoi(sv[0]);	/* for next input line */
245 		lex_at_bol = 1;
246 		goto top;
247 	}
248 
249 	lex_at_bol = 0;
250 
251 	/*
252 	 * Skip white space
253 	 */
254 	if (is_white(c))
255 		goto top;
256 
257 	/*
258 	 * Symbol? Might be a keyword or just an identifier
259 	 */
260 	if (is_sstart(c)) {
261 		/* we got a symbol */
262 		do {
263 			*p++ = c;
264 			c = getch(fp);
265 		} while (is_sfollow(c));
266 		(void) ungetc(c, fp);
267 		*p = 0;
268 
269 		sym = sym_enter(lexeme);
270 
271 		yylval = &sym->s_node;
272 
273 		if (sym->kw) {
274 			return (sym->kw->token);
275 		} else {
276 			return (IDENTIFIER);
277 		}
278 	}
279 
280 	/*
281 	 * Integer constant?
282 	 */
283 	if (is_digit(c)) {
284 		/* we got a number */
285 		*p++ = c;
286 		if (c == '0') {
287 			c = getch(fp);
288 			if (c == 'x' || c == 'X') {
289 				/* handle hex specially */
290 				do {
291 					*p++ = c;
292 					c = getch(fp);
293 				} while (is_xdigit(c));
294 				goto convert_icon;
295 			} else if (c == 'b' || c == 'B' ||
296 			    c == 'd' || c == 'D' ||
297 			    c == 'o' || c == 'O') {
298 				do {
299 					*p++ = c;
300 					c = getch(fp);
301 				} while (is_digit(c));
302 				goto convert_icon;
303 			}
304 			(void) ungetc(c, fp);
305 		}
306 		/* could be anything */
307 		c = getch(fp);
308 		while (is_digit(c)) {
309 			*p++ = c;
310 			c = getch(fp);
311 		}
312 
313 convert_icon:
314 		*p = 0;
315 		(void) ungetc(c, fp);
316 
317 		intg = int_enter(strtol(lexeme, 0, 0));
318 		yylval = &intg->s_node;
319 
320 		return (INTEGER);
321 	}
322 
323 	/* Could handle strings. We don't seem to need them yet */
324 
325 	yylval = 0;		/* operator tokens have no value */
326 	xc = getch(fp);		/* get look-ahead for two-char lexemes */
327 
328 	lexeme[0] = c;
329 	lexeme[1] = xc;
330 	lexeme[2] = 0;
331 
332 	/*
333 	 * Look for to-end-of-line comment
334 	 */
335 	if (c == '/' && xc == '/') {
336 		/* eat the comment */
337 		while ((c = getch(fp)) != EOF && c != '\n')
338 			;
339 		(void) ungetc(c, fp);		/* put back newline */
340 		goto top;
341 	}
342 
343 	/*
344 	 * Look for multi-line comment
345 	 */
346 	if (c == '/' && xc == '*') {
347 		/* eat the comment */
348 		xc = -1;
349 		while ((c = getch(fp)) != EOF) {
350 			if (xc == '*' && c == '/') {
351 				/* that's it */
352 				break;
353 			}
354 			xc = c;
355 			if (c == '\n')
356 				line_number++;
357 		}
358 		goto top;
359 	}
360 
361 	/*
362 	 * Use symbol table lookup for two-character and
363 	 * one character operator tokens.
364 	 */
365 	sym = sym_find(lexeme);
366 	if (sym) {
367 		/* there better be a keyword attached */
368 		yylval = &sym->s_node;
369 		return (sym->kw->token);
370 	}
371 
372 	/* Try a one-character form */
373 	(void) ungetc(xc, fp);
374 	lexeme[1] = 0;
375 	sym = sym_find(lexeme);
376 	if (sym) {
377 		/* there better be a keyword attached */
378 		yylval = &sym->s_node;
379 		return (sym->kw->token);
380 	}
381 
382 	if (is_between(c, ' ', '~'))
383 		compile_error("unrecognized character: 0x%02x (%c)", c, c);
384 	else
385 		compile_error("unrecognized character: 0x%02x", c);
386 	goto top;
387 }
388 
389 static ndr_symbol_t *
390 sym_find(char *name)
391 {
392 	ndr_symbol_t		**pp;
393 	ndr_symbol_t		*p;
394 
395 	for (pp = &symbol_list; (p = *pp) != 0; pp = &p->next) {
396 		if (strcmp(p->name, name) == 0)
397 			return (p);
398 	}
399 
400 	return (0);
401 }
402 
403 static ndr_symbol_t *
404 sym_enter(char *name)
405 {
406 	ndr_symbol_t		**pp;
407 	ndr_symbol_t		*p;
408 
409 	for (pp = &symbol_list; (p = *pp) != 0; pp = &p->next) {
410 		if (strcmp(p->name, name) == 0)
411 			return (p);
412 	}
413 
414 	p = ndr_alloc(1, sizeof (ndr_symbol_t));
415 
416 	if ((p->name = strdup(name)) == NULL)
417 		fatal_error("%s", strerror(ENOMEM));
418 
419 	p->s_node.label = IDENTIFIER;
420 	p->s_node.n_sym = p;
421 
422 	*pp = p;
423 
424 	return (p);
425 }
426 
427 static ndr_integer_t *
428 int_enter(long value)
429 {
430 	ndr_integer_t		**pp;
431 	ndr_integer_t		*p;
432 
433 	for (pp = &integer_list; (p = *pp) != 0; pp = &p->next) {
434 		if (p->value == value)
435 			return (p);
436 	}
437 
438 	p = ndr_alloc(1, sizeof (ndr_integer_t));
439 
440 	p->value = value;
441 	p->s_node.label = INTEGER;
442 	p->s_node.n_int = value;
443 
444 	*pp = p;
445 
446 	return (p);
447 }
448 
/*
 * Allocate zero-filled memory for nelem objects of elsize bytes each.
 * Failure is reported through fatal_error(), so callers need not
 * check the result.
 */
void *
ndr_alloc(size_t nelem, size_t elsize)
{
	void *mem = calloc(nelem, elsize);

	if (mem != NULL)
		return (mem);

	fatal_error("%s", strerror(ENOMEM));
	/* NOTREACHED */
	return (mem);
}
461 
462 /*
463  * The input context (filename, line number) is maintained by the
464  * lexical analysis, and we generally want such info reported for
465  * errors in a consistent manner.
466  */
467 void
468 compile_error(const char *fmt, ...)
469 {
470 	char	buf[NDLBUFSZ];
471 	va_list ap;
472 
473 	va_start(ap, fmt);
474 	(void) vsnprintf(buf, NDLBUFSZ, fmt, ap);
475 	va_end(ap);
476 
477 	(void) fprintf(stderr, "ndrgen: compile error: %s:%d: %s\n",
478 	    file_name->name, line_number, buf);
479 
480 	n_compile_error++;
481 }
482 
483 void
484 fatal_error(const char *fmt, ...)
485 {
486 	char	buf[NDLBUFSZ];
487 	va_list ap;
488 
489 	va_start(ap, fmt);
490 	(void) vsnprintf(buf, NDLBUFSZ, fmt, ap);
491 	va_end(ap);
492 
493 	(void) fprintf(stderr, "ndrgen: fatal error: %s\n", buf);
494 	exit(1);
495 }
496 
497 /*
498  * Setup nodes for the lexical analyzer.
499  */
500 struct node *
501 n_cons(int label, ...)
502 {
503 	ndr_node_t		*np;
504 	va_list ap;
505 
506 	np = ndr_alloc(1, sizeof (ndr_node_t));
507 
508 	va_start(ap, label);
509 	np->label = label;
510 	np->n_arg[0] = va_arg(ap, void *);
511 	np->n_arg[1] = va_arg(ap, void *);
512 	np->n_arg[2] = va_arg(ap, void *);
513 	va_end(ap);
514 
515 	np->line_number = line_number;
516 	np->file_name = file_name;
517 
518 	return (np);
519 }
520 
521 /*
522  *	list:	item
523  *	|	list item	={ n_splice($1, $2); }
524  *	;
525  */
526 void
527 n_splice(struct node *np1, struct node *np2)
528 {
529 	while (np1->n_next)
530 		np1 = np1->n_next;
531 
532 	np1->n_next = np2;
533 }
534 
/*
 * Split a string into words, in place, and store a pointer to each
 * word in sv[], followed by a null pointer.  Returns the number of
 * words.
 *
 * Words are separated by blanks, tabs, newlines or form feeds.  Text
 * between a pair of single or double quotes is taken literally, so it
 * may contain separators, and the quotes themselves are dropped; an
 * unclosed quote extends to the end of the string.
 *
 * The words are compacted toward the front of buf, which is therefore
 * overwritten.  The number of words stored is not bounded: the caller
 * must make sv[] large enough for every word plus the terminator.
 */
static int
str_to_sv(char *buf, char *sv[])
{
	char		**slot = sv;
	char		*src = buf;	/* next character to examine */
	char		*dst = buf;	/* next character to store */
	int		inside = 0;	/* non-zero while within a word */
	int		ch;

	while ((ch = *src++) != 0) {
		int	blank = (ch == ' ' || ch == '\t' ||
		    ch == '\n' || ch == '\f');

		if (!inside) {
			if (blank)
				continue;

			/* first character of a new word */
			*slot++ = dst;
			inside = 1;
		}

		if (ch == '\'' || ch == '"') {
			int	closer = ch;

			/* copy up to, but not including, the closing quote */
			for (ch = *src++; ch != 0 && ch != closer; ch = *src++)
				*dst++ = ch;
			if (ch == 0)
				break;
		} else if (blank) {
			/* end of word */
			*dst++ = 0;
			inside = 0;
		} else {
			/* still inside word */
			*dst++ = ch;
		}
	}

	/* terminate a word that ran to the end of the string */
	if (inside)
		*dst++ = 0;

	*slot = (char *)0;
	return (slot - sv);
}
583 }
584