tools/ndrgen/ndr_lex.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <errno.h>
#include <stdarg.h>
#include "ndrgen.h"
#include "y.tab.h"

/*
 * C-like lexical analysis.
 *
 * 1. Define a "struct node"
 * 2. Define a "struct symbol" that encapsulates a struct node.
 * 3. Define a "struct integer" that encapsulates a struct node.
 * 4. Set the YACC stack type in the grammar:
 *		%{
 *		#define YYSTYPE struct node *
 *		%}
 * 5. Define %token's in the grammar for IDENTIFIER, STRING and INTEGER.
 *    Using "_KW" as a suffix for keyword tokens, i.e. "struct" is
 *    "%token STRUCT_KW":
 *	// atomic values
 *	%token INTEGER STRING IDENTIFIER
 *	// keywords
 *	%token STRUCT_KW CASE_KW
 *	// operators
 *	%token PLUS MINUS ASSIGN ARROW
 *	// overloaded tokens (++ --, < > <= >=, == !=, += -= *= ...)
 *	%token INCOP RELOP EQUOP ASSOP
 * 6. It's easiest to use the yacc(1) generated token numbers for node
 *    labels.  For node labels that are not actually part of the grammar,
 *    use a %token with an L_ prefix:
 *	// node labels (can't be generated by lex)
 *	%token L_LT L_LTE L_GT L_GTE L_EQU L_NEQ
 * 7. Call set_lex_input() before parsing.
 */

/*
 * Character classification helpers for the lexer.
 *
 * These are deliberately locale-independent (plain ASCII ranges).
 * Every argument is fully parenthesized, but note that most of these
 * macros evaluate their argument more than once, so do not pass an
 * expression with side effects.
 *
 * Two different notions of white space are used:
 *   iswhite()	blank, tab, newline, form-feed (word splitting)
 *   is_white()	blank, carriage return, tab, form-feed; newline is
 *		excluded so the caller can handle it separately
 */
#define	SQ	'\''
#define	DQ	'"'

#define	isquote(c) ((c) == SQ || (c) == DQ)
#define	iswhite(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\f')

#define	is_between(c, l, u)  ((l) <= (c) && (c) <= (u))
#define	is_white(c)	\
	((c) == ' ' || (c) == '\r' || (c) == '\t' || (c) == '\f')
#define	is_lower(c)	is_between((c), 'a', 'z')
#define	is_upper(c)	is_between((c), 'A', 'Z')
#define	is_alpha(c)	(is_lower(c) || is_upper(c))
#define	is_digit(c)	is_between((c), '0', '9')
#define	is_sstart(c)	(is_alpha(c) || (c) == '_')
#define	is_sfollow(c)	(is_sstart(c) || is_digit(c))
#define	is_xdigit(c)	\
	(is_digit(c) || is_between((c), 'A', 'F') || is_between((c), 'a', 'f'))

/*
 * Head of the singly linked list of every symbol entered so far:
 * keywords, operators, identifiers and input file names.
 */
ndr_symbol_t		*symbol_list;
/* Head of the list of distinct integer constants seen so far. */
static ndr_integer_t	*integer_list;
/* Stream the lexer reads from; set by set_lex_input(). */
static FILE		*lex_infp;
/* Symbol naming the current input file; updated by '#' line markers. */
static ndr_symbol_t	*file_name;
/* Current input line, used in diagnostics and recorded in new nodes. */
int			line_number;
/* Number of errors reported through compile_error(). */
int			n_compile_error;

/*
 * Non-zero while the next character read is the first on its line;
 * a '#' is only treated as a preprocessor line in that state.
 */
static int		lex_at_bol;

/* In yacc(1) generated parser */
extern struct node	*yylval;

/*
 * The keywtable[] and optable[] could be external to this lex
 * and it would all still work.
 *
 * keywtable[] maps each reserved word to the token returned for it.
 * The table ends with an all-zero entry; keyw_tab_init() stops at the
 * first entry whose name is NULL.
 *
 * NOTE(review): the third field is 0 for ordinary keywords and 1, 2 or
 * 4 for the BASIC_TYPE entries, which looks like the size of the type
 * in bytes -- confirm against the ndr_keyword_t definition.
 */
static ndr_keyword_t keywtable[] = {
	{ "struct",	STRUCT_KW,	0 },
	{ "union",	UNION_KW,	0 },
	{ "typedef",	TYPEDEF_KW,	0 },

	{ "interface",	INTERFACE_KW,	0 },
	{ "uuid",	UUID_KW,	0 },
	{ "_no_reorder", _NO_REORDER_KW, 0 },
	{ "extern",	EXTERN_KW,	0 },
	{ "reference",	REFERENCE_KW,	0 },

	{ "align",	ALIGN_KW,	0 },
	{ "operation",	OPERATION_KW,	0 },
	{ "in",		IN_KW,		0 },
	{ "out",	OUT_KW,		0 },

	{ "string",	STRING_KW,	0 },
	{ "size_is",	SIZE_IS_KW,	0 },
	{ "length_is",	LENGTH_IS_KW,	0 },

	{ "switch_is",	SWITCH_IS_KW,	0 },
	{ "case",	CASE_KW,	0 },
	{ "default",	DEFAULT_KW,	0 },

	{ "transmit_as", TRANSMIT_AS_KW, 0 },
	{ "arg_is",	ARG_IS_KW,	0 },

	{ "char",	BASIC_TYPE,	1 },
	{ "uchar",	BASIC_TYPE,	1 },
	{ "wchar",	BASIC_TYPE,	2 },
	{ "short",	BASIC_TYPE,	2 },
	{ "ushort",	BASIC_TYPE,	2 },
	{ "long",	BASIC_TYPE,	4 },
	{ "ulong",	BASIC_TYPE,	4 },
	{0}
};

/*
 * Operator and punctuation lexemes with the token returned for each.
 * They are entered into the symbol table like keywords, and yylex()
 * finds them there by looking up the one- or two-character lexeme.
 * Every entry here is a single character.  The table ends with an
 * all-zero entry.
 */
static ndr_keyword_t optable[] = {
	{ "{",		LC,		0 },
	{ "}",		RC,		0 },
	{ "(",		LP,		0 },
	{ ")",		RP,		0 },
	{ "[",		LB,		0 },
	{ "]",		RB,		0 },
	{ "*",		STAR,		0 },
	{ "/",		DIV,		0 },
	{ "%",		MOD,		0 },
	{ "-",		MINUS,		0 },
	{ "+",		PLUS,		0 },
	{ "&",		AND,		0 },
	{ "|",		OR,		0 },
	{ "^",		XOR,		0 },
	{ ";",		SEMI,		0 },
	{0}
};

/*
 * Forward declarations of the file-local helpers defined further down.
 */
static int getch(FILE *fp);
static ndr_integer_t *int_enter(long);
static ndr_symbol_t *sym_enter(char *);
static ndr_symbol_t *sym_find(char *);
static int str_to_sv(char *, char *sv[]);

/*
 * Register every entry of a keyword/operator table in the symbol table.
 *
 * kwtable must be terminated by an entry whose name is NULL.  Each name
 * is entered with sym_enter() (which returns the existing symbol if the
 * name is already known) and the symbol is pointed back at its table
 * entry, so the lexer can later map the symbol to a token.
 */
static void
keyw_tab_init(ndr_keyword_t kwtable[])
{
	ndr_keyword_t		*entry;

	for (entry = kwtable; entry->name; entry++)
		sym_enter(entry->name)->kw = entry;
}

/*
 * Prepare the lexer for a new input stream.
 *
 * fp	stream that yylex() will read from
 * name	name of that stream, used in diagnostics until a '#' line
 *	marker in the input replaces it
 *
 * Also enters the keywords and operators into the symbol table and
 * resets the position to line 1, beginning of line.
 */
void
set_lex_input(FILE *fp, char *name)
{
	/* Keywords are entered before operators, then the file name. */
	keyw_tab_init(keywtable);
	keyw_tab_init(optable);
	file_name = sym_enter(name);

	lex_infp = fp;

	lex_at_bol = 1;
	line_number = 1;
}

/*
 * Read one character from fp.
 * Returns whatever getc(3C) returns: the character as an int, or EOF.
 */
static int
getch(FILE *fp)
{
	int	ch;

	ch = getc(fp);

	return (ch);
}

/*
 * Append c to the lexeme being assembled at p.  limit is the last byte
 * of the lexeme buffer and is reserved for the terminating NUL; once the
 * buffer is full the character is dropped and *truncated is set instead.
 * Returns the new end of the lexeme.
 */
static char *
lex_append(char *p, char *limit, int c, int *truncated)
{
	if (p < limit)
		*p++ = (char)c;
	else
		*truncated = 1;

	return (p);
}

/*
 * Lexical analyzer called by the yacc(1) generated parser.
 *
 * Reads the stream given to set_lex_input() and returns the next token:
 *   - the token of a keyword or operator found in the symbol table,
 *   - IDENTIFIER for any other symbol,
 *   - INTEGER for a numeric constant,
 *   - EOF at end of input.
 * For symbols, integers and operators yylval is pointed at the node of
 * the matching symbol-table or integer-table entry.
 *
 * White space, comments (to end of line and multi-line) and '#' lines
 * are consumed without producing a token.  A '#' line of the form
 *	# <number> "<file>" ...
 * sets the file name and line number reported in diagnostics.
 *
 * Integer constants are decimal, octal (leading 0), hex (0x), or use
 * one of the prefixes 0b (binary), 0d (decimal) and 0o (octal).
 *
 * Problems are reported with compile_error() and lexing continues.
 */
int
yylex(void)
{
	char		lexeme[512];
	/* last byte of lexeme[], reserved for the terminating NUL */
	char		*lexeme_end = lexeme + sizeof (lexeme) - 1;
	char		*p;
	FILE		*fp = lex_infp;
	int		c, xc;
	int		truncated;
	ndr_symbol_t	*sym;
	ndr_integer_t	*intg;

top:
	p = lexeme;
	truncated = 0;

	c = getch(fp);
	if (c == EOF)
		return (EOF);

	if (c == '\n') {
		line_number++;
		lex_at_bol = 1;
		goto top;
	}

	/*
	 * Handle preprocessor lines. This just notes
	 * which file we're processing.
	 */
	if (c == '#' && lex_at_bol) {
		/*
		 * str_to_sv() does not bound the vector it fills, so make
		 * it large enough for the most words lexeme[] can hold
		 * (one character plus one separator each) and the
		 * terminating null pointer.
		 */
		char		*sv[sizeof (lexeme) / 2 + 2];
		char		*end;
		long		lnum;

		/* an over-long line is silently cut to fit lexeme[] */
		while ((c = getch(fp)) != EOF && c != '\n')
			p = lex_append(p, lexeme_end, c, &truncated);
		*p = 0;

		/*
		 * Only "# <number> <file>" is a line we know.  Requiring
		 * the first word to be a number keeps lines such as
		 * "# pragma x y" from being taken for a line marker.
		 */
		if (*lexeme == ' ' && str_to_sv(lexeme, sv) >= 2) {
			lnum = strtol(sv[0], &end, 10);
			if (end != sv[0] && *end == 0 && lnum >= 0) {
				/*
				 * note: the newline is not counted, the
				 * marker gives the number of the next line
				 */
				file_name = sym_enter(sv[1]);
				line_number = (int)lnum;
				lex_at_bol = 1;
				goto top;
			}
		}

		/*
		 * Not a line we know.  It still occupies a line of the
		 * input, so count the newline consumed above.
		 */
		if (c == '\n')
			line_number++;
		lex_at_bol = 1;
		goto top;
	}

	lex_at_bol = 0;

	/*
	 * Skip white space
	 */
	if (is_white(c))
		goto top;

	/*
	 * Symbol? Might be a keyword or just an identifier
	 */
	if (is_sstart(c)) {
		do {
			p = lex_append(p, lexeme_end, c, &truncated);
			c = getch(fp);
		} while (is_sfollow(c));
		(void) ungetc(c, fp);
		*p = 0;

		if (truncated) {
			compile_error("identifier too long, "
			    "truncated to %d characters",
			    (int)(sizeof (lexeme) - 1));
		}

		sym = sym_enter(lexeme);

		yylval = &sym->s_node;

		if (sym->kw)
			return (sym->kw->token);
		return (IDENTIFIER);
	}

	/*
	 * Integer constant?
	 */
	if (is_digit(c)) {
		char		*digits = lexeme;  /* text given to strtol() */
		char		*end;
		long		value;
		int		base = 0;	/* 0: strtol() picks 8/10/16 */

		p = lex_append(p, lexeme_end, c, &truncated);
		if (c == '0') {
			c = getch(fp);
			if (c == 'x' || c == 'X') {
				/* hex; strtol() understands the prefix */
				do {
					p = lex_append(p, lexeme_end, c,
					    &truncated);
					c = getch(fp);
				} while (is_xdigit(c));
				goto convert_icon;
			} else if (c == 'b' || c == 'B' ||
			    c == 'd' || c == 'D' ||
			    c == 'o' || c == 'O') {
				/*
				 * strtol() does not understand these
				 * prefixes: pick the base here and convert
				 * only the digits that follow the prefix.
				 */
				if (c == 'b' || c == 'B')
					base = 2;
				else if (c == 'd' || c == 'D')
					base = 10;
				else
					base = 8;
				digits = lexeme + 2;

				do {
					p = lex_append(p, lexeme_end, c,
					    &truncated);
					c = getch(fp);
				} while (is_digit(c));
				goto convert_icon;
			}
			(void) ungetc(c, fp);
		}
		/* could be anything: decimal, or octal after a leading 0 */
		c = getch(fp);
		while (is_digit(c)) {
			p = lex_append(p, lexeme_end, c, &truncated);
			c = getch(fp);
		}

convert_icon:
		*p = 0;
		(void) ungetc(c, fp);

		if (truncated)
			compile_error("integer constant too long");

		/*
		 * The whole lexeme must convert; otherwise it has no
		 * digits after the prefix or a digit that is not valid
		 * in its base (e.g. "0x", "08", "0b12").
		 */
		value = strtol(digits, &end, base);
		if (end == digits || *end != 0)
			compile_error("malformed integer constant: %s", lexeme);

		intg = int_enter(value);
		yylval = &intg->s_node;

		return (INTEGER);
	}

	/* Could handle strings. We don't seem to need them yet */

	yylval = NULL;		/* cleared; set again if an operator matches */
	xc = getch(fp);		/* get look-ahead for two-char lexemes */

	lexeme[0] = c;
	lexeme[1] = xc;
	lexeme[2] = 0;

	/*
	 * Look for to-end-of-line comment
	 */
	if (c == '/' && xc == '/') {
		/* eat the comment */
		while ((c = getch(fp)) != EOF && c != '\n')
			;
		(void) ungetc(c, fp);		/* put back newline */
		goto top;
	}

	/*
	 * Look for multi-line comment
	 */
	if (c == '/' && xc == '*') {
		/* eat the comment; xc remembers the previous character */
		xc = -1;
		while ((c = getch(fp)) != EOF) {
			if (xc == '*' && c == '/') {
				/* that's it */
				break;
			}
			xc = c;
			if (c == '\n')
				line_number++;
		}
		if (c == EOF)
			compile_error("unterminated comment");
		goto top;
	}

	/*
	 * Use symbol table lookup for two-character and
	 * one character operator tokens.  Only symbols with a keyword
	 * attached are operators; a file name that happens to spell the
	 * same lexeme has none and must not be dereferenced.
	 */
	sym = sym_find(lexeme);
	if (sym != NULL && sym->kw != NULL) {
		yylval = &sym->s_node;
		return (sym->kw->token);
	}

	/* Try a one-character form */
	(void) ungetc(xc, fp);
	lexeme[1] = 0;
	sym = sym_find(lexeme);
	if (sym != NULL && sym->kw != NULL) {
		yylval = &sym->s_node;
		return (sym->kw->token);
	}

	if (is_between(c, ' ', '~'))
		compile_error("unrecognized character: 0x%02x (%c)", c, c);
	else
		compile_error("unrecognized character: 0x%02x", c);
	goto top;
}

/*
 * Look up name in the symbol list.
 * Returns the symbol whose name matches exactly, or a null pointer if
 * there is none.  The list is never modified.
 */
static ndr_symbol_t *
sym_find(char *name)
{
	ndr_symbol_t		*sym = symbol_list;

	while (sym != NULL) {
		if (strcmp(sym->name, name) == 0)
			break;
		sym = sym->next;
	}

	return (sym);
}

/*
 * Find the symbol called name, creating it if it does not exist yet.
 *
 * A new symbol gets its own copy of name, is labelled IDENTIFIER, has
 * its node pointed back at the symbol and is appended to the end of the
 * symbol list.  All other fields are left zeroed by ndr_alloc().
 * Running out of memory is a fatal error.
 */
static ndr_symbol_t *
sym_enter(char *name)
{
	ndr_symbol_t		**link = &symbol_list;
	ndr_symbol_t		*sym;

	/* Stop at the matching symbol, or with link at the list's tail. */
	while ((sym = *link) != NULL) {
		if (strcmp(sym->name, name) == 0)
			return (sym);
		link = &sym->next;
	}

	sym = ndr_alloc(1, sizeof (*sym));

	sym->name = strdup(name);
	if (sym->name == NULL)
		fatal_error("%s", strerror(ENOMEM));

	sym->s_node.n_sym = sym;
	sym->s_node.label = IDENTIFIER;

	*link = sym;

	return (sym);
}

/*
 * Find the integer-table entry for value, creating it if needed.
 *
 * Each distinct value is stored once.  A new entry is labelled INTEGER,
 * carries the value in its node and is appended to the end of the
 * integer list.
 */
static ndr_integer_t *
int_enter(long value)
{
	ndr_integer_t		**link = &integer_list;
	ndr_integer_t		*intg;

	/* Stop at the matching entry, or with link at the list's tail. */
	while ((intg = *link) != NULL) {
		if (intg->value == value)
			return (intg);
		link = &intg->next;
	}

	intg = ndr_alloc(1, sizeof (*intg));

	intg->s_node.label = INTEGER;
	intg->s_node.n_int = value;
	intg->value = value;

	*link = intg;

	return (intg);
}

/*
 * Allocate zero-filled memory for nelem elements of elsize bytes each.
 * Failure is reported through fatal_error() instead of being returned
 * to the caller.
 */
void *
ndr_alloc(size_t nelem, size_t elsize)
{
	void *mem = calloc(nelem, elsize);

	if (mem != NULL)
		return (mem);

	fatal_error("%s", strerror(ENOMEM));
	/* NOTREACHED */
	return (NULL);
}

/*
 * Report an error in the input being compiled.
 *
 * The lexer tracks the input context (file name and line number), so
 * every message is prefixed with it here to keep diagnostics uniform.
 * fmt and the following arguments are as for printf(3C); the formatted
 * text is limited to NDLBUFSZ bytes.  Each call is counted in
 * n_compile_error.
 */
void
compile_error(const char *fmt, ...)
{
	va_list	args;
	char	msg[NDLBUFSZ];

	va_start(args, fmt);
	(void) vsnprintf(msg, sizeof (msg), fmt, args);
	va_end(args);

	n_compile_error++;

	(void) fprintf(stderr, "ndrgen: compile error: %s:%d: %s\n",
	    file_name->name, line_number, msg);
}

/*
 * Report an unrecoverable error and terminate.
 *
 * fmt and the following arguments are as for printf(3C); the formatted
 * text is limited to NDLBUFSZ bytes.  The message goes to stderr and
 * the process exits with status 1, so this function does not return.
 */
void
fatal_error(const char *fmt, ...)
{
	va_list	args;
	char	msg[NDLBUFSZ];

	va_start(args, fmt);
	(void) vsnprintf(msg, sizeof (msg), fmt, args);
	va_end(args);

	(void) fprintf(stderr, "ndrgen: fatal error: %s\n", msg);

	exit(1);
}

/*
 * Construct a new node.
 *
 * label is stored as the node's label.  Exactly three pointer arguments
 * must follow it: all three are fetched unconditionally and stored in
 * n_arg[0..2], so a caller with fewer operands has to pad with null
 * pointers.  The node also records the current file name and line
 * number.
 */
struct node *
n_cons(int label, ...)
{
	ndr_node_t		*node;
	va_list			args;
	int			i;

	node = ndr_alloc(1, sizeof (*node));

	node->label = label;
	node->file_name = file_name;
	node->line_number = line_number;

	va_start(args, label);
	for (i = 0; i < 3; i++)
		node->n_arg[i] = va_arg(args, void *);
	va_end(args);

	return (node);
}

/*
 * Append np2 to the end of the list that starts at np1.  Intended for
 * left-recursive list rules in the grammar:
 *
 *	list:	item
 *	|	list item	={ n_splice($1, $2); }
 *	;
 *
 * np1 must not be a null pointer.
 */
void
n_splice(struct node *np1, struct node *np2)
{
	struct node	*tail;

	/* find the last node of the list */
	for (tail = np1; tail->n_next != NULL; tail = tail->n_next)
		;

	tail->n_next = np2;
}

/*
 * Split a string of words into a vector of strings, in place.
 *
 * Words are separated by white space (blank, tab, newline, form-feed).
 * Text between a pair of matching single or double quotes is taken
 * literally, white space included, and the quotes are dropped.  The
 * words are compacted into buf itself and NUL terminated; sv[] receives
 * a pointer to each word followed by a null pointer.
 *
 * No bound is applied to sv[]: the caller must supply room for every
 * word buf can contain plus the terminating null pointer.
 *
 * Returns the number of words.
 */
static int
str_to_sv(char *buf, char *sv[])
{
	char		**next_sv = sv;
	char		*src = buf;	/* next character to read */
	char		*dst = buf;	/* next character to write */
	int		in_word = 0;
	int		c;

	while ((c = *src++) != 0) {
		if (!in_word) {
			if (iswhite(c))
				continue;

			/* first character of a new word */
			*next_sv++ = dst;
			in_word = 1;
		}

		if (iswhite(c)) {
			/* end of word */
			*dst++ = 0;
			in_word = 0;
			continue;
		}

		if (!isquote(c)) {
			/* still inside word */
			*dst++ = c;
			continue;
		}

		/* quoted text: copy up to the matching quote */
		{
			int		qc = c;

			while (((c = *src++) != 0) && (c != qc))
				*dst++ = c;
			if (c == 0)
				break;		/* unterminated quote */
		}
	}

	if (in_word)
		*dst++ = 0;

	*next_sv = (char *)0;
	return (next_sv - sv);
}