awk1.c (revision 1a90c98d7539778aeb0a1d20f735b66aaba17fca) - OpenGrok cross reference for /illumos-gate/usr/src/cmd/awk_xpg4/awk1.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
 */

/*
 * awk -- mainline, yylex, etc.
 *
 * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
 */

#include "awk.h"
#include "y.tab.h"
#include <stdarg.h>
#include <unistd.h>
#include <locale.h>
#include <search.h>

static char	*progfiles[NPFILE];	/* Programmes files for yylex */
static char	**progfilep = &progfiles[0]; /* Pointer to last file */
static wchar_t	*progptr;		/* In-memory programme */
static int	proglen;		/* Length of progptr */
static wchar_t	context[NCONTEXT];	/* Circular buffer of context */
static wchar_t	*conptr = &context[0];	/* context ptr */
static FILE	*progfp;		/* Stdio stream for programme */
static char	*filename;
#ifdef	DEBUG
static int	dflag;
#endif

#define	AWK_EXEC_MAGIC	"<MKS AWKC>"
#define	LEN_EXEC_MAGIC	10

static char	unbal[] = "unbalanced E char";

static void	awkarginit(int c, char **av);
static int	lexid(wint_t c);
static int	lexnumber(wint_t c);
static int	lexstring(wint_t endc);
static int	lexregexp(wint_t endc);

static void	awkvarinit(void);
static wint_t	lexgetc(void);
static void	lexungetc(wint_t c);
static size_t	lexescape(wint_t endc, int regx, int cmd_line_operand);
static void	awkierr(int perr, const char *fmt, va_list ap) __NORETURN;
static int	usage(void);
void		strescape(wchar_t *str);
static const char	*toprint(wint_t);
char *_cmdname;
static wchar_t *mbconvert(char *str);

extern int	isclvar(wchar_t *arg);

/*
 * mainline for awk
 */
int
main(int argc, char *argv[])
{
	wchar_t *ap;
	char *cmd;

	cmd = argv[0];
	_cmdname = cmd;

	linebuf = emalloc(NLINE * sizeof (wchar_t));

	/*
	 * At this point only messaging should be internationalized.
	 * numbers are still scanned as in the Posix locale.
	 */
	(void) setlocale(LC_ALL, "");
	(void) setlocale(LC_NUMERIC, "C");
#if !defined(TEXT_DOMAIN)
#define	TEXT_DOMAIN	"SYS_TEST"
#endif
	(void) textdomain(TEXT_DOMAIN);

	awkvarinit();
	/* running = 1; */
	while (argc > 1 && *argv[1] == '-') {
		void *save_ptr = NULL;
		ap = mbstowcsdup(&argv[1][1]);
		if (ap == NULL)
			break;
		if (*ap == '\0') {
			free(ap);
			break;
		}
		save_ptr = (void *) ap;
		++argv;
		--argc;
		if (*ap == '-' && ap[1] == '\0')
			break;
		for (; *ap != '\0'; ++ap) {
			switch (*ap) {
#ifdef DEBUG
			case 'd':
				dflag = 1;
				continue;

#endif
			case 'f':
				if (argc < 2) {
					(void) fprintf(stderr,
				gettext("Missing script file\n"));
					return (1);
				}
				*progfilep++ = argv[1];
				--argc;
				++argv;
				continue;

			case 'F':
				if (ap[1] == '\0') {
					if (argc < 2) {
						(void) fprintf(stderr,
				gettext("Missing field separator\n"));
						return (1);
					}
					ap = mbstowcsdup(argv[1]);
					--argc;
					++argv;
				} else
					++ap;
				strescape(ap);
				strassign(varFS, linebuf, FALLOC,
				    wcslen(linebuf));
				break;

			case 'v': {
				wchar_t *vp;
				wchar_t *arg;

				if (argc < 2) {
					(void) fprintf(stderr,
		gettext("Missing variable assignment\n"));
					return (1);
				}
				arg = mbconvert(argv[1]);
				/*
				 * Ensure the variable expression
				 * is valid (correct form).
				 */
				if (((vp = wcschr(arg, '=')) != NULL) &&
				    isclvar(arg)) {
					*vp = '\0';
					strescape(vp+1);
					strassign(vlook(arg), linebuf,
					    FALLOC|FSENSE,
					    wcslen(linebuf));
					*vp = '=';
				} else {
					(void) fprintf(stderr, gettext(
					    "Invalid form for variable "
					    "assignment: %S\n"), arg);
					return (1);
				}
				--argc;
				++argv;
				continue;
			}

			default:
				(void) fprintf(stderr,
				gettext("Unknown option \"-%S\"\n"), ap);
				return (usage());
			}
			break;
		}
		if (save_ptr)
			free(save_ptr);
	}
	if (progfilep == &progfiles[0]) {
		if (argc < 2)
			return (usage());
		filename = "[command line]";	/* BUG: NEEDS TRANSLATION */
		progptr = mbstowcsdup(argv[1]);
		proglen = wcslen(progptr);
		--argc;
		++argv;
	}

	argv[0] = cmd;

	awkarginit(argc, argv);

	/* running = 0; */
	(void) yyparse();

	lineno = 0;
	/*
	 * Ok, done parsing, so now activate the rest of the nls stuff, set
	 * the radix character.
	 */
	(void) setlocale(LC_ALL, "");
	radixpoint = *localeconv()->decimal_point;
	awk();
	/* NOTREACHED */
	return (0);
}

/*
 * Do initial setup of buffers, etc.
 * This must be called before most processing
 * and especially before lexical analysis.
 * Variables initialised here will be overruled by command
 * line parameter initialisation.
 */
static void
awkvarinit()
{
	NODE *np;

	(void) setvbuf(stderr, NULL, _IONBF, 0);

	if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
		(void) fprintf(stderr,
	gettext("not enough available file descriptors"));
		exit(1);
	}
	ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
#ifdef A_ZERO_POINTERS
	(void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
#else
	{
		/* initialize file descriptor table */
		OFILE *fp;
		for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
			fp->f_fp = FNULL;
					fp->f_mode = 0;
					fp->f_name = (char *)0;
		}
	}
#endif
	constant = intnode((INT)0);

	const0 = intnode((INT)0);
	const1 = intnode((INT)1);
	constundef = emptynode(CONSTANT, 0);
	constundef->n_flags = FSTRING|FVINT;
	constundef->n_string = _null;
	constundef->n_strlen = 0;
	inc_oper = emptynode(ADD, 0);
	inc_oper->n_right = const1;
	asn_oper = emptynode(ADD, 0);
	field0 = node(FIELD, const0, NNULL);

	{
		RESFUNC near*rp;

		for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
			np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
		}
	}
	{
		RESERVED near*rp;

		for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
			switch (rp->r_type) {
			case SVAR:
			case VAR:
				running = 1;
				np = vlook(rp->r_name);
				if (rp->r_type == SVAR)
					np->n_flags |= FSPECIAL;
				if (rp->r_svalue != NULL)
					strassign(np, rp->r_svalue, FSTATIC,
					    (size_t)rp->r_ivalue);
				else {
					constant->n_int = rp->r_ivalue;
					(void) assign(np, constant);
				}
				running = 0;
				break;

			case KEYWORD:
				kinstall(rp->r_name, (int)rp->r_ivalue);
				break;
			}
		}
	}

	varNR = vlook(s_NR);
	varFNR = vlook(s_FNR);
	varNF = vlook(s_NF);
	varOFMT = vlook(s_OFMT);
	varCONVFMT = vlook(s_CONVFMT);
	varOFS = vlook(s_OFS);
	varORS = vlook(s_ORS);
	varRS = vlook(s_RS);
	varFS = vlook(s_FS);
	varARGC = vlook(s_ARGC);
	varSUBSEP = vlook(s_SUBSEP);
	varENVIRON = vlook(s_ENVIRON);
	varFILENAME = vlook(s_FILENAME);
	varSYMTAB = vlook(s_SYMTAB);
	incNR = node(ASG, varNR, node(ADD, varNR, const1));
	incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
	clrFNR = node(ASG, varFNR, const0);
}

/*
 * Initialise awk ARGC, ARGV variables.
 */
static void
awkarginit(int ac, char **av)
{
	int i;
	wchar_t *cp;

	ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
	running = 1;
	constant->n_int = ac;
	(void) assign(varARGC, constant);
	for (i = 0; i < ac; ++i) {
		cp = mbstowcsdup(av[i]);
		constant->n_int = i;
		strassign(exprreduce(ARGVsubi), cp,
		    FSTATIC|FSENSE, wcslen(cp));
	}
	running = 0;
}

/*
 * Clean up when done parsing a function.
 * All formal parameters, because of a deal (funparm) in
 * yylex, get put into the symbol table in front of any
 * global variable of the same name.  When the entire
 * function is parsed, remove these formal dummy nodes
 * from the symbol table but retain the nodes because
 * the generated tree points at them.
 */
void
uexit(NODE *np)
{
	NODE *formal;

	while ((formal = getlist(&np)) != NNULL)
		delsymtab(formal, 0);
}

/*
 * The lexical analyzer.
 */
int
yylex()
{
	wint_t c, c1;
	int i;
	static int savetoken = 0;
	static int wasfield;
	static int isfuncdef;
	static int nbrace, nparen, nbracket;
	static struct ctosymstruct {
		wint_t c, sym;
	} ctosym[] = {
		{ '|', BAR },		{ '^', CARAT },
		{ '~', TILDE },		{ '<', LANGLE },
		{ '>', RANGLE },	{ '+', PLUSC },
		{ '-', HYPHEN },	{ '*', STAR },
		{ '/', SLASH },		{ '%', PERCENT },
		{ '!', EXCLAMATION },	{ '$', DOLLAR },
		{ '[', LSQUARE },	{ ']', RSQUARE },
		{ '(', LPAREN },	{ ')', RPAREN },
		{ ';', SEMI },		{ '{', LBRACE },
		{ '}', RBRACE },	{   0, 0 }
	};

	if (savetoken) {
		c = savetoken;
		savetoken = 0;
	} else if (redelim != '\0') {
		c = redelim;
		redelim = 0;
		catterm = 0;
		savetoken = c;
		c = lexlast = lexregexp(c);
		goto out;
	} else while ((c = lexgetc()) != WEOF) {
		if (iswalpha(c) || c == '_') {
			c = lexid(c);
		} else if (iswdigit(c) || c == '.') {
			c = lexnumber(c);
		} else if (isWblank(c)) {
			continue;
		} else switch (c) {
#if DOS || OS2
		case 032:		/* ^Z */
			continue;
#endif

		case '"':
			c = lexstring(c);
			break;

		case '#':
			while ((c = lexgetc()) != '\n' && c != WEOF)
				;
			lexungetc(c);
			continue;

		case '+':
			if ((c1 = lexgetc()) == '+')
				c = INC;
			else if (c1 == '=')
				c = AADD;
			else
				lexungetc(c1);
			break;

		case '-':
			if ((c1 = lexgetc()) == '-')
				c = DEC;
			else if (c1 == '=')
				c = ASUB;
			else
				lexungetc(c1);
			break;

		case '*':
			if ((c1 = lexgetc()) == '=')
				c = AMUL;
			else if (c1 == '*') {
				if ((c1 = lexgetc()) == '=')
					c = AEXP;
				else {
					c = EXP;
					lexungetc(c1);
				}
			} else
				lexungetc(c1);
			break;

		case '^':
			if ((c1 = lexgetc()) == '=') {
				c = AEXP;
			} else {
				c = EXP;
				lexungetc(c1);
			}
			break;

		case '/':
			if ((c1 = lexgetc()) == '=' &&
			    lexlast != RE && lexlast != NRE &&
			    lexlast != ';' && lexlast != '\n' &&
			    lexlast != ',' && lexlast != '(')
				c = ADIV;
			else
				lexungetc(c1);
			break;

		case '%':
			if ((c1 = lexgetc()) == '=')
				c = AREM;
			else
				lexungetc(c1);
			break;

		case '&':
			if ((c1 = lexgetc()) == '&')
				c = AND;
			else
				lexungetc(c1);
			break;

		case '|':
			if ((c1 = lexgetc()) == '|')
				c = OR;
			else {
				lexungetc(c1);
				if (inprint)
					c = PIPE;
			}
			break;

		case '>':
			if ((c1 = lexgetc()) == '=')
				c = GE;
			else if (c1 == '>')
				c = APPEND;
			else {
				lexungetc(c1);
				if (nparen == 0 && inprint)
					c = WRITE;
			}
			break;

		case '<':
			if ((c1 = lexgetc()) == '=')
				c = LE;
			else
				lexungetc(c1);
			break;

		case '!':
			if ((c1 = lexgetc()) == '=')
				c = NE;
			else if (c1 == '~')
				c = NRE;
			else
				lexungetc(c1);
			break;

		case '=':
			if ((c1 = lexgetc()) == '=')
				c = EQ;
			else {
				lexungetc(c1);
				c = ASG;
			}
			break;

		case '\n':
			switch (lexlast) {
			case ')':
				if (catterm || inprint) {
					c = ';';
					break;
				}
			/* FALLTHROUGH */
			case AND:
			case OR:
			case COMMA:
			case '{':
			case ELSE:
			case ';':
			case DO:
				continue;

			case '}':
				if (nbrace != 0)
					continue;
				/* FALLTHROUGH */

			default:
				c = ';';
				break;
			}
			break;

		case ELSE:
			if (lexlast != ';') {
				savetoken = ELSE;
				c = ';';
			}
			break;

		case '(':
			++nparen;
			break;

		case ')':
			if (--nparen < 0)
				awkerr(unbal, "()");
			break;

		case '{':
			nbrace++;
			break;

		case '}':
			if (--nbrace < 0) {
				char brk[3];

				brk[0] = '{';
				brk[1] = '}';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			if (lexlast != ';') {
				savetoken = c;
				c = ';';
			}
			break;

		case '[':
			++nbracket;
			break;

		case ']':
			if (--nbracket < 0) {
				char brk[3];

				brk[0] = '[';
				brk[1] = ']';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			break;

		case '\\':
			if ((c1 = lexgetc()) == '\n')
				continue;
			lexungetc(c1);
			break;

		case ',':
			c = COMMA;
			break;

		case '?':
			c = QUEST;
			break;

		case ':':
			c = COLON;
			break;

		default:
			if (!iswprint(c))
				awkerr(
				    gettext("invalid character \"%s\""),
				    toprint(c));
			break;
		}
		break;
	}

	switch (c) {
	case ']':
		++catterm;
		break;

	case VAR:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else if (!isfuncdef) {
			if ((c1 = lexgetc()) != '(')
				++catterm;
			lexungetc(c1);
		}
		isfuncdef = 0;
		break;

	case PARM:
	case CONSTANT:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else {
			if (lexlast == '$')
				wasfield = 2;
			++catterm;
		}
		break;

	case INC:
	case DEC:
		if (!catterm || lexlast != CONSTANT || wasfield)
			break;

	/* FALLTHROUGH */
	case UFUNC:
	case FUNC:
	case GETLINE:
	case '!':
	case '$':
	case '(':
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		}
		break;

	case '}':
		if (nbrace == 0)
			savetoken = ';';
	/* FALLTHROUGH */
	case ';':
		inprint = 0;
	/* FALLTHROUGH */
	default:
		if (c == DEFFUNC)
			isfuncdef = 1;
		catterm = 0;
	}
	lexlast = c;
	if (wasfield)
		wasfield--;
	/*
	 * Map character constants to symbolic names.
	 */
	for (i = 0; ctosym[i].c != 0; i++)
		if (c == ctosym[i].c) {
			c = ctosym[i].sym;
			break;
		}
out:
#ifdef DEBUG
	if (dflag)
		(void) printf("%d\n", (int)c);
#endif
	return ((int)c);
}

/*
 * Read a number for the lexical analyzer.
 * Input is the first character of the number.
 * Return value is the lexical type.
 */
static int
lexnumber(wint_t c)
{
	wchar_t *cp;
	int dotfound = 0;
	int efound = 0;
	INT number;

	cp = linebuf;
	do {
		if (iswdigit(c))
			;
		else if (c == '.') {
			if (dotfound++)
				break;
		} else if (c == 'e' || c == 'E') {
			if ((c = lexgetc()) != '-' && c != '+') {
				lexungetc(c);
				c = 'e';
			} else
				*cp++ = 'e';
			if (efound++)
				break;
		} else
			break;
		*cp++ = c;
	} while ((c = lexgetc()) != WEOF);
	*cp = '\0';
	if (dotfound && cp == linebuf+1)
		return (DOT);
	lexungetc(c);
	errno = 0;
	if (!dotfound && !efound &&
	    ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
		yylval.node = intnode(number);
	else
		yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
	return (CONSTANT);
}

/*
 * Read an identifier.
 * Input is first character of identifier.
 * Return VAR.
 */
static int
lexid(wint_t c)
{
	wchar_t *cp;
	size_t i;
	NODE *np;

	cp = linebuf;
	do {
		*cp++ = c;
		c = lexgetc();
	} while (iswalpha(c) || iswdigit(c) || c == '_');
	*cp = '\0';
	lexungetc(c);
	yylval.node = np = vlook(linebuf);

	switch (np->n_type) {
	case KEYWORD:
		switch (np->n_keywtype) {
		case PRINT:
		case PRINTF:
			++inprint;
			/* FALLTHROUGH */
		default:
			return ((int)np->n_keywtype);
		}
		/* NOTREACHED */

	case ARRAY:
	case VAR:
		/*
		 * If reading the argument list, create a dummy node
		 * for the duration of that function. These variables
		 * can be removed from the symbol table at function end
		 * but they must still exist because the execution tree
		 * knows about them.
		 */
		if (funparm) {
do_funparm:
			np = emptynode(PARM, i = (cp-linebuf));
			np->n_flags = FSTRING;
			np->n_string = _null;
			np->n_strlen = 0;
			(void) memcpy(np->n_name, linebuf,
			    (i+1) * sizeof (wchar_t));
			addsymtab(np);
			yylval.node = np;
		} else if (np == varNF || (np == varFS &&
		    (!doing_begin || begin_getline))) {
			/*
			 * If the user program references NF or sets
			 * FS either outside of a begin block or
			 * in a begin block after a getline then the
			 * input line will be split immediately upon read
			 * rather than when a field is first referenced.
			 */
			needsplit = 1;
		} else if (np == varENVIRON)
			needenviron = 1;
	/* FALLTHROUGH */
	case PARM:
		return (VAR);

	case UFUNC:
		/*
		 * It is ok to redefine functions as parameters
		 */
		if (funparm) goto do_funparm;
	/* FALLTHROUGH */
	case FUNC:
	case GETLINE:
		/*
		 * When a getline is encountered, clear the 'doing_begin' flag.
		 * This will force the 'needsplit' flag to be set, even inside
		 * a begin block, if FS is altered. (See VAR case above)
		 */
		if (doing_begin)
			begin_getline = 1;
		return (np->n_type);
	}
	/* NOTREACHED */
	return (0);
}

/*
 * Read a string for the lexical analyzer.
 * `endc' terminates the string.
 */
static int
lexstring(wint_t endc)
{
	size_t length = lexescape(endc, 0, 0);

	yylval.node = stringnode(linebuf, FALLOC, length);
	return (CONSTANT);
}

/*
 * Read a regular expression.
 */
static int
lexregexp(wint_t endc)
{
	(void) lexescape(endc, 1, 0);
	yylval.node = renode(linebuf);
	return (URE);
}

/*
 * Process a string, converting the escape characters as required by
 * 1003.2. The processed string ends up in the global linebuf[]. This
 * routine also changes the value of 'progfd' - the program file
 * descriptor, so it should be used with some care. It is presently used to
 * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
 */
void
strescape(wchar_t *str)
{
	progptr = str;
	proglen = wcslen(str) + 1;	/* Include \0 */
	(void) lexescape('\0', 0, 1);
	progptr = NULL;
}

/*
 * Read a string or regular expression, terminated by ``endc'',
 * for lexical analyzer, processing escape sequences.
 * Return string length.
 */
static size_t
lexescape(wint_t endc, int regx, int cmd_line_operand)
{
	static char nlre[256];
	static char nlstr[256];
	static char eofre[256];
	static char eofstr[256];
	int first_time = 1;
	wint_t c;
	wchar_t *cp;
	int n, max;

	if (first_time == 1) {
		(void) strcpy(nlre, gettext("Newline in regular expression\n"));
		(void) strcpy(nlstr, gettext("Newline in string\n"));
		(void) strcpy(eofre, gettext("EOF in regular expression\n"));
		(void) strcpy(eofstr, gettext("EOF in string\n"));
		first_time = 0;
	}

	cp = linebuf;
	while ((c = lexgetc()) != endc) {
		if (c == '\n')
			awkerr(regx ? nlre : nlstr);
		if (c == '\\') {
			switch (c = lexgetc(), c) {
			case '\\':
				if (regx)
					*cp++ = '\\';
				break;

			case '/':
				c = '/';
				break;

			case 'n':
				c = '\n';
				break;

			case 'b':
				c = '\b';
				break;

			case 't':
				c = '\t';
				break;

			case 'r':
				c = '\r';
				break;

			case 'f':
				c = '\f';
				break;

			case 'v':
				c = '\v';
				break;

			case 'a':
				c = (char)0x07;
				break;

			case 'x':
				n = 0;
				while (iswxdigit(c = lexgetc())) {
					if (iswdigit(c))
						c -= '0';
					else if (iswupper(c))
						c -= 'A'-10;
					else
						c -= 'a'-10;
					n = (n<<4) + c;
				}
				lexungetc(c);
				c = n;
				break;

			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
#if 0
/*
 * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
 * requires processing of the octal escapes both in strings and
 * regular expressions. The following code is disabled instead of
 * removed as back-referencing may be reintroduced in a future draft
 * of the standard.
 */
				/*
				 * For regular expressions, we disallow
				 * \ooo to mean octal character, in favour
				 * of back referencing.
				 */
				if (regx) {
					*cp++ = '\\';
					break;
				}
#endif
				max = 3;
				n = 0;
				do {
					n = (n<<3) + c-'0';
					if ((c = lexgetc()) > '7' || c < '0')
						break;
				} while (--max);
				lexungetc(c);
				/*
				 * an octal escape sequence must have at least
				 * 2 digits after the backslash, otherwise
				 * it gets passed straight thru for possible
				 * use in backreferencing.
				 */
				if (max == 3) {
					*cp++ = '\\';
					n += '0';
				}
				c = n;
				break;

			case '\n':
				continue;

			default:
				if (c != endc || cmd_line_operand) {
					*cp++ = '\\';
					if (c == endc)
						lexungetc(c);
				}
			}
		}
		if (c == WEOF)
			awkerr(regx ? eofre : eofstr);
		*cp++ = c;
	}
	*cp = '\0';
	return (cp - linebuf);
}

/*
 * Build a regular expression NODE.
 * Argument is the string holding the expression.
 */
NODE *
renode(wchar_t *s)
{
	NODE *np;
	int n;

	np = emptynode(RE, 0);
	np->n_left = np->n_right = NNULL;
	if ((n = REGWCOMP(&np->n_regexp, s)) != REG_OK) {
		int m;
		char *p;

		m = REGWERROR(n, np->n_regexp, NULL, 0);
		p = (char *)emalloc(m);
		REGWERROR(n, np->n_regexp, p, m);
		awkerr("/%S/: %s", s, p);
	}
	return (np);
}
/*
 * Get a character for the lexical analyser routine.
 */
static wint_t
lexgetc()
{
	wint_t c;
	static char **files = &progfiles[0];

	if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
		;
	else {
		if (progptr != NULL) {
			if (proglen-- <= 0)
				c = WEOF;
			else
				c = *progptr++;
		} else {
			if (progfp != FNULL) {
				if (progfp != stdin)
					(void) fclose(progfp);
				else
					clearerr(progfp);
				progfp = FNULL;
			}
			if (files < progfilep) {
				filename = *files++;
				lineno = 1;
				if (filename[0] == '-' && filename[1] == '\0')
					progfp = stdin;
				else if ((progfp = fopen(filename, r))
				    == FNULL) {
					(void) fprintf(stderr,
				gettext("script file \"%s\""), filename);
					exit(1);
				}
				c = fgetwc(progfp);
			}
		}
	}
	if (c == '\n')
		++lineno;
	if (conptr >= &context[NCONTEXT])
		conptr = &context[0];
	if (c != WEOF)
		*conptr++ = c;
	return (c);
}

/*
 * Return a character for lexical analyser.
 * Only one returned character is (not enforced) legitimite.
 */
static void
lexungetc(wint_t c)
{
	if (c == '\n')
		--lineno;
	if (c != WEOF) {
		if (conptr == &context[0])
			conptr = &context[NCONTEXT];
		*--conptr = '\0';
	}
	if (progfp != FNULL) {
		(void) ungetwc(c, progfp);
		return;
	}
	if (c == WEOF)
		return;
	*--progptr = c;
	proglen++;
}

/*
 * Syntax errors during parsing.
 */
int
yyerror(const char *s, ...)
{
	if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
		if (lexlast == KEYWORD)
			awkerr(gettext("inadmissible use of reserved keyword"));
		else
			awkerr(gettext("attempt to redefine builtin function"));
	awkerr(s);
	return (0);
}

/*
 * Error routine for all awk errors.
 */
void
awkerr(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	awkierr(0, fmt, args);
	va_end(args);
}

/*
 * Error routine like "awkerr" except that it prints out
 * a message that includes an errno-specific indication.
 */
void
awkperr(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	awkierr(1, fmt, args);
	va_end(args);
}

/*
 * Common internal routine for awkerr, awkperr
 */
static void
awkierr(int perr, const char *fmt, va_list ap)
{
	static char sep1[] = "\n>>>\t";
	static char sep2[] = "\t<<<";
	int saveerr = errno;

	(void) fprintf(stderr, "%s: ", _cmdname);
	if (running) {
		(void) fprintf(stderr, gettext("line %u ("),
		    curnode == NNULL ? 0 : curnode->n_lineno);
		if (phase == 0)
			(void) fprintf(stderr, "NR=%lld): ",
			    (INT)exprint(varNR));
		else
			(void) fprintf(stderr, "%s): ",
			    phase == BEGIN ? s_BEGIN : s_END);
	} else if (lineno != 0) {
		(void) fprintf(stderr, gettext("file \"%s\": "), filename);
		(void) fprintf(stderr, gettext("line %u: "), lineno);
	}
	(void) vfprintf(stderr, gettext(fmt), ap);
	if (perr == 1)
		(void) fprintf(stderr, ": %s", strerror(saveerr));
	if (perr != 2 && !running) {
		wchar_t *cp;
		int n;
		int c;

		(void) fprintf(stderr, gettext("  Context is:%s"), sep1);
		cp = conptr;
		n = NCONTEXT;
		do {
			if (cp >= &context[NCONTEXT])
				cp = &context[0];
			if ((c = *cp++) != '\0')
				(void) fputs(c == '\n' ? sep1 : toprint(c),
				    stderr);
		} while (--n != 0);
		(void) fputs(sep2, stderr);
	}
	(void) fprintf(stderr, "\n");
	exit(1);
}

wchar_t *
emalloc(unsigned n)
{
	wchar_t *cp;

	if ((cp = malloc(n)) == NULL)
		awkerr(nomem);
	return (cp);
}

wchar_t *
erealloc(wchar_t *p, unsigned n)
{
	wchar_t *cp;

	if ((cp = realloc(p, n)) == NULL)
		awkerr(nomem);
	return (cp);
}


/*
 * usage message for awk
 */
static int
usage()
{
	(void) fprintf(stderr, gettext(
"Usage:	awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
"	awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n"));
	return (2);
}


static wchar_t *
mbconvert(char *str)
{
	static wchar_t *op = 0;

	if (op != 0)
		free(op);
	return (op = mbstowcsdup(str));
}

char *
mbunconvert(wchar_t *str)
{
	static char *op = 0;

	if (op != 0)
		free(op);
	return (op = wcstombsdup(str));
}

/*
 * Solaris port - following functions are typical MKS functions written
 * to work for Solaris.
 */

wchar_t *
mbstowcsdup(char *s)
{
	int n;
	wchar_t *w;

	n = strlen(s) + 1;
	if ((w = (wchar_t *)malloc(n * sizeof (wchar_t))) == NULL)
		return (NULL);

	if (mbstowcs(w, s, n) == (size_t)-1)
		return (NULL);
	return (w);

}

char *
wcstombsdup(wchar_t *w)
{
	int n;
	char *mb;

	/* Fetch memory for worst case string length */
	n = wslen(w) + 1;
	n *= MB_CUR_MAX;
	if ((mb = (char *)malloc(n)) == NULL) {
		return (NULL);
	}

	/* Convert the string */
	if ((n = wcstombs(mb, w, n)) == -1) {
		int saverr = errno;

		free(mb);
		errno = saverr;
		return (0);
	}

	/* Shrink the string down */
	if ((mb = (char *)realloc(mb, strlen(mb)+1)) == NULL)  {
		return (NULL);
	}
	return (mb);
}

/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  The first entry is for value 127, thus the
 * entries for the remaining character values are from 1..32.
 */
static const char *const upe_ctrls[] =
{
	"^?",
	"^@",  "^A",  "^B",  "^C",  "^D",  "^E",  "^F",  "^G",
	"^H",  "^I",  "^J",  "^K",  "^L",  "^M",  "^N",  "^O",
	"^P",  "^Q",  "^R",  "^S",  "^T",  "^U",  "^V",  "^W",
	"^X",  "^Y",  "^Z",  "^[",  "^\\", "^]",  "^^",  "^_"
};


/*
 * Return a printable string corresponding to the given character value.  If
 * the character is printable, simply return it as the string.  If it is in
 * the range specified by table 5-101 in the UPE, return the corresponding
 * string.  Otherwise, return an octal escape sequence.
 */
static const char *
toprint(wchar_t c)
{
	int n, len;
	unsigned char *ptr;
	static char mbch[MB_LEN_MAX+1];
	static char buf[5 * MB_LEN_MAX + 1];

	if ((n = wctomb(mbch, c)) == -1) {
		/* Should never happen */
		(void) sprintf(buf, "\\%x", c);
		return (buf);
	}
	mbch[n] = '\0';
	if (iswprint(c)) {
		return (mbch);
	} else if (c == 127) {
		return (upe_ctrls[0]);
	} else if (c < 32) {
		/* Print as in Table 5-101 in the UPE */
		return (upe_ctrls[c+1]);
	} else {
		/* Print as an octal escape sequence */
		for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr)
			len += sprintf(buf+len, "\\%03o", *ptr);
	}
	return (buf);
}

static int
wcoff(const wchar_t *astring, const int off)
{
	const wchar_t *s = astring;
	int c = 0;
	char mb[MB_LEN_MAX];

	while (c < off) {
		int n;
		if ((n = wctomb(mb, *s)) == 0)
			break;
		if (n == -1)
			n = 1;
		c += n;
		s++;
	}

	return (s - astring);
}

#define	NREGHASH	64
#define	NREGHOLD	1024	/* max number unused entries */

static int	nregunref;

struct reghashq {
	struct qelem hq;
	struct regcache *regcachep;
};

struct regcache {
	struct qelem	lq;
	wchar_t	*pattern;
	regex_t	re;
	int	refcnt;
	struct reghashq	hash;
};

static struct qelem reghash[NREGHASH], reglink;

/*
 * Generate a hash value of the given wchar string.
 * The hashing method is similar to what Java does for strings.
 */
static uint_t
regtxthash(const wchar_t *str)
{
	int k = 0;

	while (*str != L'\0')
		k = (31 * k) + *str++;

	k += ~(k << 9);
	k ^=  (k >> 14);
	k +=  (k << 4);
	k ^=  (k >> 10);

	return (k % NREGHASH);
}

int
int_regwcomp(REGEXP *r, const wchar_t *pattern)
{
	regex_t re;
	char *mbpattern;
	int ret;
	uint_t key;
	struct qelem *qp;
	struct regcache *rcp;

	key = regtxthash(pattern);
	for (qp = reghash[key].q_forw; qp != NULL; qp = qp->q_forw) {
		rcp = ((struct reghashq *)qp)->regcachep;
		if (*rcp->pattern == *pattern &&
		    wcscmp(rcp->pattern, pattern) == 0)
			break;
	}
	if (qp != NULL) {
		/* update link. put this one at the beginning */
		if (rcp != (struct regcache *)reglink.q_forw) {
			remque(&rcp->lq);
			insque(&rcp->lq, &reglink);
		}
		if (rcp->refcnt == 0)
			nregunref--;	/* no longer unref'ed */
		rcp->refcnt++;
		*(struct regcache **)r = rcp;
		return (REG_OK);
	}

	if ((mbpattern = wcstombsdup((wchar_t *)pattern)) == NULL)
		return (REG_ESPACE);

	ret = regcomp(&re, mbpattern, REG_EXTENDED);

	free(mbpattern);

	if (ret != REG_OK)
		return (ret);

	if ((rcp = malloc(sizeof (struct regcache))) == NULL)
		return (REG_ESPACE);
	rcp->re = re;
	if ((rcp->pattern = wsdup(pattern)) == NULL) {
		regfree(&re);
		free(rcp);
		return (REG_ESPACE);
	}
	rcp->refcnt = 1;
	insque(&rcp->lq, &reglink);
	insque(&rcp->hash.hq, &reghash[key]);
	rcp->hash.regcachep = rcp;

	*(struct regcache **)r = rcp;
	return (ret);
}

void
int_regwfree(REGEXP r)
{
	int	cnt;
	struct qelem *qp, *nqp;
	struct regcache *rcp;

	rcp = (struct regcache *)r;

	if (--rcp->refcnt != 0)
		return;

	/* this cache has no reference */
	if (++nregunref < NREGHOLD)
		return;

	/*
	 * We've got too much unref'ed regex. Free half of least
	 * used regex.
	 */
	cnt = 0;
	for (qp = reglink.q_forw; qp != NULL; qp = nqp) {
		nqp = qp->q_forw;
		rcp = (struct regcache *)qp;
		if (rcp->refcnt != 0)
			continue;

		/* free half of them */
		if (++cnt < (NREGHOLD / 2))
			continue;

		/* detach and free */
		remque(&rcp->lq);
		remque(&rcp->hash.hq);

		/* free up */
		free(rcp->pattern);
		regfree(&rcp->re);
		free(rcp);

		nregunref--;
	}
}

size_t
int_regwerror(int errcode, REGEXP r, char *errbuf, size_t bufsiz)
{
	struct regcache *rcp;

	rcp = (struct regcache *)r;
	return (regerror(errcode, &rcp->re, errbuf, bufsiz));
}

int
int_regwexec(REGEXP r,		/* compiled RE */
    const wchar_t *astring,	/* subject string */
    size_t nsub,		/* number of subexpressions */
    int_regwmatch_t *sub,	/* subexpression pointers */
    int flags)
{
	char *mbs;
	regmatch_t *mbsub = NULL;
	int i;
	struct regcache *rcp;

	if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL)
		return (REG_ESPACE);

	if (nsub > 0 && sub) {
		if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
			return (REG_ESPACE);
	}

	rcp = (struct regcache *)r;

	i = regexec(&rcp->re, mbs, nsub, mbsub, flags);

	/* Now, adjust the pointers/counts in sub */
	if (i == REG_OK && nsub > 0 && mbsub) {
		int j, k;

		for (j = 0; j < nsub; j++) {
			regmatch_t *ms = &mbsub[j];
			int_regwmatch_t *ws = &sub[j];

			if ((k = ms->rm_so) >= 0) {
				ws->rm_so = wcoff(astring, k);
				ws->rm_sp = astring + ws->rm_so;
			}
			if ((k = ms->rm_eo) >= 0) {
				ws->rm_eo = wcoff(astring, k);
				ws->rm_ep = astring + ws->rm_eo;
			}
		}
	}

	free(mbs);
	if (mbsub)
		free(mbsub);
	return (i);
}

int
int_regwdosuba(REGEXP rp,	/* compiled RE: Pattern */
    const wchar_t *rpl,		/* replacement string: /rpl/ */
    const wchar_t *src,		/* source string */
    wchar_t **dstp,		/* destination string */
    int len,			/* destination length */
    int *globp)		/* IN: occurence, 0 for all; OUT: substitutions */
{
	wchar_t *dst, *odst;
	const wchar_t *ip, *xp;
	wchar_t *op;
	int i;
	wchar_t c;
	int glob, iglob = *globp, oglob = 0;
#define	NSUB	10
	int_regwmatch_t rm[NSUB], *rmp;
	int flags;
	wchar_t *end;
	int regerr;

/* handle overflow of dst. we need "i" more bytes */
#ifdef OVERFLOW
#undef OVERFLOW
#define	OVERFLOW(i) { \
		int pos = op - dst; \
		dst = (wchar_t *)realloc(odst = dst, \
			(len += len + i) * sizeof (wchar_t)); \
		if (dst == NULL) \
			goto nospace; \
		op = dst + pos; \
		end = dst + len; \
	}
#endif

	*dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t));
	if (dst == NULL)
		return (REG_ESPACE);

	if (rp == NULL || rpl == NULL || src == NULL || dst ==  NULL)
		return (REG_EFATAL);

	glob = 0;	/* match count */
	ip = src;	/* source position */
	op = dst;	/* destination position */
	end = dst + len;

	flags = 0;
	while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
		/* Copy text preceding match */
		if (op + (i = rm[0].rm_sp - ip) >= end)
			OVERFLOW(i)
		while (i--)
			*op++ = *ip++;

		if (iglob == 0 || ++glob == iglob) {
			oglob++;
			xp = rpl;		/* do substitute */
		} else
			xp = L"&";		/* preserve text */

		/* Perform replacement of matched substing */
		while ((c = *xp++) != '\0') {
			rmp = NULL;
			if (c == '&')
				rmp = &rm[0];
			else if (c == '\\') {
				if ('0' <= *xp && *xp <= '9')
					rmp = &rm[*xp++ - '0'];
				else if (*xp != '\0')
					c = *xp++;
			}

			if (rmp ==  NULL) {	/* Ordinary character. */
				*op++ = c;
				if (op >= end)
					OVERFLOW(1)
			} else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
				ip = rmp->rm_sp;
				if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
					OVERFLOW(i)
				while (i--)
					*op++ = *ip++;
			}
		}

		ip = rm[0].rm_ep;
		if (*ip == '\0')	/* If at end break */
			break;
		else if (rm[0].rm_sp == rm[0].rm_ep) {
			/* If empty match copy next char */
			*op++ = *ip++;
			if (op >= end)
				OVERFLOW(1)
		}
		flags = REG_NOTBOL;
	}

	if (regerr != REG_OK && regerr != REG_NOMATCH)
		return (regerr);

	/* Copy rest of text */
	if (op + (i =  wcslen(ip)) >= end)
		OVERFLOW(i)
	while (i--)
		*op++ = *ip++;
	*op++ = '\0';

	if ((*dstp = dst = (wchar_t *)realloc(odst = dst,
	    sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
nospace:
		free(odst);
		return (REG_ESPACE);
	}

	*globp = oglob;

	return ((oglob == 0) ? REG_NOMATCH : REG_OK);
}