xref: /illumos-gate/usr/src/cmd/awk_xpg4/awk1.c (revision 814a60b13c0ad90e5d2edfd29a7a84bbf416cc1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * awk -- mainline, yylex, etc.
29  *
30  * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
31  *
32  * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include "awk.h"
38 #include "y.tab.h"
39 #include <stdarg.h>
40 #include <unistd.h>
41 #include <locale.h>
42 
/*
 * Lexer input state.  The program text comes either from the script files
 * named with -f (read through progfp) or from an in-memory wide string
 * (progptr/proglen).
 */
static char	*progfiles[NPFILE];	/* Program files (-f) read by the lexer */
static char	**progfilep = &progfiles[0]; /* Next free slot in progfiles[] */
static wchar_t	*progptr;		/* In-memory program text */
static int	proglen;		/* Characters remaining at progptr */
static wchar_t	context[NCONTEXT];	/* Circular buffer of recent input */
static wchar_t	*conptr = &context[0];	/* Next write position in context[] */
static FILE	*progfp;		/* Stdio stream for the current script */
static char	*filename;		/* Name reported in parse diagnostics */
#ifdef	DEBUG
static int	dflag;			/* Set by -d: yylex prints each token */
#endif

/* NOTE(review): neither macro is referenced in the visible part of the file */
#define AWK_EXEC_MAGIC	"<MKS AWKC>"
#define LEN_EXEC_MAGIC	10

/* Format handed to awkerr() when brackets of some kind do not balance */
static char	unbal[] = "unbalanced E char";

/* Forward declarations for routines defined later in this file */
static void	awkarginit(int c, char **av);
static int	lexid(wint_t c);
static int	lexnumber(wint_t c);
static int	lexstring(wint_t endc);
static int	lexregexp(register wint_t endc);

static void	awkvarinit(void);
static wint_t	lexgetc(void);
static void	lexungetc(wint_t c);
static size_t	lexescape(wint_t endc, int regx, int cmd_line_operand);
static void	awkierr(int perr, char *fmt, va_list ap);
static int	usage(void);
void		strescape(wchar_t *str);
static const char      *toprint(wint_t);
char *_cmdname;			/* argv[0]; prefix of every diagnostic */
static wchar_t *mbconvert(char *str);
76 
77 
78 /*
79  * mainline for awk
80  */
81 int
82 main(int argc, char *argv[])
83 {
84 	register wchar_t *ap;
85 	register char *cmd;
86 
87 	cmd = argv[0];
88 	_cmdname = cmd;
89 
90 	linebuf = emalloc(NLINE * sizeof(wchar_t));
91 
92 	/*l
93 	 * At this point only messaging should be internationalized.
94 	 * numbers are still scanned as in the Posix locale.
95 	 */
96 	(void) setlocale(LC_ALL,"");
97 	(void) setlocale(LC_NUMERIC,"C");
98 #if !defined(TEXT_DOMAIN)
99 #define	TEXT_DOMAIN	"SYS_TEST"
100 #endif
101 	(void) textdomain(TEXT_DOMAIN);
102 
103 	awkvarinit();
104 	/*running = 1;*/
105 	while (argc>1 && *argv[1]=='-') {
106 		void *save_ptr = NULL;
107 		ap = mbstowcsdup(&argv[1][1]);
108 		if (ap == NULL)
109 			break;
110 		if (*ap == '\0') {
111 			free(ap);
112 			break;
113 		}
114 		save_ptr = (void *) ap;
115 		++argv;
116 		--argc;
117 		if (*ap=='-' && ap[1]=='\0')
118 			break;
119 		for ( ; *ap != '\0'; ++ap) {
120 			switch (*ap) {
121 #ifdef DEBUG
122 			case 'd':
123 				dflag = 1;
124 				continue;
125 
126 #endif
127 			case 'f':
128 				if (argc < 2) {
129 					(void) fprintf(stderr,
130 				gettext("Missing script file\n"));
131 					return (1);
132 				}
133 				*progfilep++ = argv[1];
134 				--argc;
135 				++argv;
136 				continue;
137 
138 			case 'F':
139 				if (ap[1] == '\0') {
140 					if (argc < 2) {
141 						(void) fprintf(stderr,
142 				gettext("Missing field separator\n"));
143 						return (1);
144 					}
145 					ap = mbstowcsdup(argv[1]);
146 					--argc;
147 					++argv;
148 				} else
149 					++ap;
150 				strescape(ap);
151 				strassign(varFS, linebuf, FALLOC,
152 					wcslen(linebuf));
153 				break;
154 
155 			case 'v': {
156 				register wchar_t *vp;
157 				register wchar_t *arg;
158 
159 				if (argc < 2) {
160 					(void) fprintf(stderr,
161 		gettext("Missing variable assignment\n"));
162 					return (1);
163 				}
164 				arg = mbconvert(argv[1]);
165 				if ((vp = wcschr(arg, '=')) != NULL) {
166 					*vp = '\0';
167 					strescape(vp+1);
168 					strassign(vlook(arg), linebuf,
169 					    FALLOC|FSENSE, wcslen(linebuf));
170 					*vp = '=';
171 				}
172 				--argc;
173 				++argv;
174 				continue;
175 			}
176 
177 			default:
178 				(void) fprintf(stderr,
179 				gettext("Unknown option \"-%S\"\n"), ap);
180 				return (usage());
181 			}
182 			break;
183 		}
184 		if (save_ptr)
185 			free(save_ptr);
186 	}
187 	if (progfilep == &progfiles[0]) {
188 		if (argc < 2)
189 			return (usage());
190 		filename = "[command line]";	/* BUG: NEEDS TRANSLATION */
191 		progptr = mbstowcsdup(argv[1]);
192 		proglen = wcslen(progptr);
193 		--argc;
194 		++argv;
195 	}
196 
197 	argv[0] = cmd;
198 
199 	awkarginit(argc, argv);
200 
201 	/*running = 0;*/
202 	(void)yyparse();
203 
204 	lineno = 0;
205 	/*
206 	 * Ok, done parsing, so now activate the rest of the nls stuff, set
207 	 * the radix character.
208 	 */
209 	(void) setlocale(LC_ALL,"");
210 	radixpoint = *localeconv()->decimal_point;
211 	awk();
212 	/* NOTREACHED */
213 	return (0);
214 }
215 
216 /*
217  * Do initial setup of buffers, etc.
218  * This must be called before most processing
219  * and especially before lexical analysis.
220  * Variables initialised here will be overruled by command
221  * line parameter initialisation.
222  */
223 static void
224 awkvarinit()
225 {
226 	register NODE *np;
227 
228 	(void) setvbuf(stderr, NULL, _IONBF, 0);
229 
230 	if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
231 		(void) fprintf(stderr,
232 	gettext("not enough available file descriptors"));
233 		exit(1);
234 	}
235 	ofiles = (OFILE *) emalloc(sizeof(OFILE)*NIOSTREAM);
236 #ifdef A_ZERO_POINTERS
237 	(void) memset((wchar_t *) ofiles, 0, sizeof(OFILE) * NIOSTREAM);
238 #else
239 	{
240 	    /* initialize file descriptor table */
241 	    OFILE *fp;
242 	    for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
243 		fp->f_fp = FNULL;
244 		fp->f_mode = 0;
245 		fp->f_name = (char *)0;
246 	    }
247 	}
248 #endif
249 	constant = intnode((INT)0);
250 
251 	const0 = intnode((INT)0);
252 	const1 = intnode((INT)1);
253 	constundef = emptynode(CONSTANT, 0);
254 	constundef->n_flags = FSTRING|FVINT;
255 	constundef->n_string = _null;
256 	constundef->n_strlen = 0;
257 	inc_oper = emptynode(ADD, 0);
258 	inc_oper->n_right = const1;
259 	asn_oper = emptynode(ADD, 0);
260 	field0 = node(FIELD, const0, NNULL);
261 
262 	{
263 		register RESFUNC near*rp;
264 
265 		for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
266 			np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
267 		}
268 	}
269 	{
270 		register RESERVED near*rp;
271 
272 		for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
273 			switch (rp->r_type) {
274 			case SVAR:
275 			case VAR:
276 				running = 1;
277 				np = vlook(rp->r_name);
278 				if (rp->r_type == SVAR)
279 					np->n_flags |= FSPECIAL;
280 				if (rp->r_svalue != NULL)
281 					strassign(np, rp->r_svalue, FSTATIC,
282 					    (size_t)rp->r_ivalue);
283 				else {
284 					constant->n_int = rp->r_ivalue;
285 					(void)assign(np, constant);
286 				}
287 				running = 0;
288 				break;
289 
290 			case KEYWORD:
291 				kinstall(rp->r_name, (int)rp->r_ivalue);
292 				break;
293 			}
294 		}
295 	}
296 
297 	varNR = vlook(s_NR);
298 	varFNR = vlook(s_FNR);
299 	varNF = vlook(s_NF);
300 	varOFMT = vlook(s_OFMT);
301 	varCONVFMT = vlook(s_CONVFMT);
302 	varOFS = vlook(s_OFS);
303 	varORS = vlook(s_ORS);
304 	varRS = vlook(s_RS);
305 	varFS = vlook(s_FS);
306 	varARGC = vlook(s_ARGC);
307 	varSUBSEP = vlook(s_SUBSEP);
308 	varENVIRON = vlook(s_ENVIRON);
309 	varFILENAME = vlook(s_FILENAME);
310 	varSYMTAB = vlook(s_SYMTAB);
311 	incNR = node(ASG, varNR, node(ADD, varNR, const1));
312 	incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
313 	clrFNR = node(ASG, varFNR, const0);
314 }
315 
316 /*
317  * Initialise awk ARGC, ARGV variables.
318  */
319 static void
320 awkarginit(int ac, char **av)
321 {
322 	register int i;
323 	register wchar_t *cp;
324 
325 	ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
326 	running = 1;
327 	constant->n_int = ac;
328 	(void)assign(varARGC, constant);
329 	for (i = 0; i < ac; ++i) {
330 		cp = mbstowcsdup(av[i]);
331 		constant->n_int = i;
332 		strassign(exprreduce(ARGVsubi), cp,
333 		    FSTATIC|FSENSE, wcslen(cp));
334 	}
335 	running = 0;
336 }
337 
338 /*
339  * Clean up when done parsing a function.
340  * All formal parameters, because of a deal (funparm) in
341  * yylex, get put into the symbol table in front of any
342  * global variable of the same name.  When the entire
343  * function is parsed, remove these formal dummy nodes
344  * from the symbol table but retain the nodes because
345  * the generated tree points at them.
346  */
347 void
348 uexit(NODE *np)
349 {
350 	register NODE *formal;
351 
352 	while ((formal = getlist(&np)) != NNULL)
353 		delsymtab(formal, 0);
354 }
355 
/*
 * The lexical analyzer.
 *
 * Returns the next token for the parser.  A call proceeds in three steps:
 *
 *   1. Obtain a raw token: a token saved by the previous call (savetoken),
 *	a regular expression when redelim is set, or the result of reading
 *	characters with lexgetc() until something is recognised.
 *   2. Post-process it: keep track of whether the previous token ended a
 *	term (catterm) and, when the new token begins another term, return
 *	CONCAT now and keep the real token in savetoken for the next call.
 *   3. Translate single-character tokens to their symbolic names using the
 *	ctosym[] table.
 *
 * With DEBUG defined the function below is compiled as yyhex() and yylex()
 * becomes a wrapper that prints each token number when -d was given.
 */
int
yylex()
#ifdef	DEBUG
{
	register int l;

	l = yyhex();
	if (dflag)
		(void) printf("%d\n", l);
	return (l);
}
yyhex()
#endif
{
	register wint_t c, c1;
	int i;
	static int savetoken = 0;	/* token held back for the next call */
	static int wasfield;		/* countdown set after "$" CONSTANT */
	static int isfuncdef;		/* previous token was DEFFUNC */
	static int nbrace, nparen, nbracket;	/* open bracket counts */
	/* Single-character tokens and the names the grammar uses for them */
	static struct ctosymstruct {
		wint_t c, sym;
	} ctosym[] = {
		{ '|', BAR },		{ '^', CARAT },
	  	{ '~', TILDE },		{ '<', LANGLE },
	  	{ '>', RANGLE },	{ '+', PLUSC },
	  	{ '-', HYPHEN },	{ '*', STAR },
	  	{ '/', SLASH },		{ '%', PERCENT },
	  	{ '!', EXCLAMATION },	{ '$', DOLLAR },
	  	{ '[', LSQUARE },	{ ']', RSQUARE },
		{ '(', LPAREN },	{ ')', RPAREN },
		{ ';', SEMI },		{ '{', LBRACE },
		{ '}', RBRACE },	{   0, 0 }
	};

	if (savetoken) {
		/* Deliver the token that the previous call held back */
		c = savetoken;
		savetoken = 0;
	} else if (redelim != '\0') {
		/*
		 * A regular expression is pending: read it up to the
		 * delimiter and arrange for the delimiter itself to be
		 * returned by the next call.
		 */
		c = redelim;
		redelim = 0;
		catterm = 0;
		savetoken = c;
		return (lexlast = lexregexp(c));
	} else while ((c = lexgetc()) != WEOF) {
		if (iswalpha(c) || c=='_') {
			c = lexid(c);
		} else if (iswdigit(c) || c=='.') {
			c = lexnumber(c);
		} else if (isWblank(c)) {
			continue;
		} else switch (c) {
#if DOS || OS2
		case 032:		/* ^Z */
			continue;
#endif

		case '"':
			c = lexstring(c);
			break;

		case '#':
			/* Comment: discard up to, but not including, newline */
			while ((c = lexgetc())!='\n' && c!=WEOF)
				;
			lexungetc(c);
			continue;

		case '+':
			if ((c1 = lexgetc()) == '+')
				c = INC;
			else if (c1 == '=')
				c = AADD;
			else
				lexungetc(c1);
			break;

		case '-':
			if ((c1 = lexgetc()) == '-')
				c = DEC;
			else if (c1 == '=')
				c = ASUB;
			else
				lexungetc(c1);
			break;

		case '*':
			/* "*=", "**=", "**" or plain "*" */
			if ((c1 = lexgetc()) == '=')
				c = AMUL;
			else if (c1 == '*') {
				if ((c1 = lexgetc()) == '=')
					c = AEXP;
				else {
					c = EXP;
					lexungetc(c1);
				}
			} else
				lexungetc(c1);
			break;

		case '^':
			if ((c1 = lexgetc()) == '=') {
				c = AEXP;
			} else {
				c = EXP;
				lexungetc(c1);
			}
			break;

		case '/':
			/*
			 * "/=" is the divide-assign operator unless the
			 * previous token is one of those listed here, in
			 * which case the '=' is pushed back and '/' is
			 * returned on its own.
			 */
			if ((c1 = lexgetc()) == '='
			 && lexlast!=RE && lexlast!=NRE
			 && lexlast!=';' && lexlast!='\n'
			 && lexlast!=',' && lexlast!='(')
				c = ADIV;
			else
				lexungetc(c1);
			break;

		case '%':
			if ((c1 = lexgetc()) == '=')
				c = AREM;
			else
				lexungetc(c1);
			break;

		case '&':
			if ((c1 = lexgetc()) == '&')
				c = AND;
			else
				lexungetc(c1);
			break;

		case '|':
			/* "||", or a pipe when inside print/printf */
			if ((c1 = lexgetc()) == '|')
				c = OR;
			else {
				lexungetc(c1);
				if (inprint)
					c = PIPE;
			}
			break;

		case '>':
			/*
			 * ">=", ">>", or '>' which is output redirection
			 * when inside print/printf and not parenthesised.
			 */
			if ((c1 = lexgetc()) == '=')
				c = GE;
			else if (c1 == '>')
				c = APPEND;
			else {
				lexungetc(c1);
				if (nparen==0 && inprint)
					c = WRITE;
			}
			break;

		case '<':
			if ((c1 = lexgetc()) == '=')
				c = LE;
			else
				lexungetc(c1);
			break;

		case '!':
			if ((c1 = lexgetc()) == '=')
				c = NE;
			else if (c1 == '~')
				c = NRE;
			else
				lexungetc(c1);
			break;

		case '=':
			if ((c1 = lexgetc()) == '=')
				c = EQ;
			else {
				lexungetc(c1);
				c = ASG;
			}
			break;

		case '\n':
			/*
			 * A newline is either ignored or turned into ';',
			 * depending on the token that preceded it.
			 */
			switch (lexlast) {
			case ')':
				if (catterm || inprint) {
					c = ';';
					break;
				}
				/* FALLTHROUGH */
			case AND:
			case OR:
			case COMMA:
			case '{':
			case ELSE:
			case ';':
			case DO:
				continue;

			case '}':
				if (nbrace != 0)
					continue;
				/* FALLTHROUGH */

			default:
				c = ';';
				break;
			}
			break;

		/*
		 * NOTE(review): c is a raw input character in this switch,
		 * so this case only matches a character whose value equals
		 * the ELSE token number - confirm whether it is reachable.
		 */
		case ELSE:
			if (lexlast != ';') {
				savetoken = ELSE;
				c = ';';
			}
			break;

		case '(':
			++nparen;
			break;

		case ')':
			if (--nparen < 0)
				awkerr(unbal, "()");
			break;

		case '{':
			nbrace++;
			break;

		case '}':
			if (--nbrace < 0) {
				char brk[3];

				brk[0] = '{';
				brk[1] = '}';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			/* Supply a ';' before '}' if the source had none */
			if (lexlast != ';') {
				savetoken = c;
				c = ';';
			}
			break;

		case '[':
			++nbracket;
			break;

		case ']':
			if (--nbracket < 0) {
				char brk[3];

				brk[0] = '[';
				brk[1] = ']';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			break;

		case '\\':
			/* Backslash-newline joins lines */
			if ((c1 = lexgetc()) == '\n')
				continue;
			lexungetc(c1);
			break;

		case ',':
			c = COMMA;
			break;

		case '?':
			c = QUEST;
			break;

		case ':':
			c = COLON;
			break;

		default:
			if (!iswprint(c))
				awkerr(
				   gettext("invalid character \"%s\""),
				   toprint(c));
			break;
		}
		/* A token was recognised: leave the reading loop */
		break;
	}

	/*
	 * Post-processing.  catterm is non-zero when the previous token
	 * ended a term; a token that starts a new term is then preceded
	 * by an explicit CONCAT.
	 */
	switch (c) {
	case ']':
		++catterm;
		break;

	case VAR:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else if (!isfuncdef) {
			/* Peek: a name directly followed by '(' ends no term */
			if ((c1=lexgetc()) != '(')
				++catterm;
			lexungetc(c1);
		}
		isfuncdef = 0;
		break;

	case PARM:
	case CONSTANT:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else {
			if (lexlast == '$')
				wasfield = 2;
			++catterm;
		}
		break;

	case INC:
	case DEC:
		if (!catterm || lexlast!=CONSTANT || wasfield)
			break;
		/* FALLTHROUGH */

	case UFUNC:
	case FUNC:
	case GETLINE:
	case '!':
	case '$':
	case '(':
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		}
		break;

	/*{*/case '}':
		if (nbrace == 0)
			savetoken = ';';
		/* FALLTHROUGH */
	case ';':
		inprint = 0;
		/* FALLTHROUGH */
	default:
		if (c == DEFFUNC)
			isfuncdef = 1;
		catterm = 0;
	}
	lexlast = c;
	if (wasfield)
		wasfield--;
	/*
	 * Map character constants to symbolic names.
	 */
	for (i = 0; ctosym[i].c != 0; i++)
		if (c == ctosym[i].c) {
			c = ctosym[i].sym;
			break;
		}
	return ((int)c);
}
714 
715 /*
716  * Read a number for the lexical analyzer.
717  * Input is the first character of the number.
718  * Return value is the lexical type.
719  */
720 static int
721 lexnumber(wint_t c)
722 {
723 	register wchar_t *cp;
724 	register int dotfound = 0;
725 	register int efound = 0;
726 	INT number;
727 
728 	cp = linebuf;
729 	do {
730 		if (iswdigit(c))
731 			;
732 		else if (c == '.') {
733 			if (dotfound++)
734 				break;
735 		} else if (c=='e' || c=='E') {
736 			if ((c = lexgetc())!='-'  &&  c!='+') {
737 				lexungetc(c);
738 				c = 'e';
739 			} else
740 				*cp++ = 'e';
741 			if (efound++)
742 				break;
743 		} else
744 			break;
745 		*cp++ = c;
746 	} while ((c = lexgetc()) != WEOF);
747 	*cp = '\0';
748 	if (dotfound && cp==linebuf+1)
749 		return (DOT);
750 	lexungetc(c);
751 	errno = 0;
752 	if (!dotfound
753 	 && !efound
754 	 && ((number=wcstol(linebuf, (wchar_t **)0, 10)), errno!=ERANGE))
755 		yylval.node = intnode(number);
756 	else
757 		yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
758 	return (CONSTANT);
759 }
760 
/*
 * Read an identifier.
 * Input is first character of identifier.
 *
 * The name is collected in linebuf and looked up with vlook(); the node
 * found (or, for a formal parameter, a newly created PARM node) is left
 * in yylval.node.
 *
 * Returns the keyword's own token type for a KEYWORD, VAR for variables,
 * arrays and parameters, and the node type for UFUNC, FUNC and GETLINE.
 */
static int
lexid(wint_t c)
{
	register wchar_t *cp;
	register size_t i;
	register NODE *np;

	/* Collect letters, digits and underscores into linebuf */
	cp = linebuf;
	do {
		*cp++ = c;
		c = lexgetc();
	} while (iswalpha(c) || iswdigit(c) || c=='_');
	*cp = '\0';
	lexungetc(c);
	yylval.node = np = vlook(linebuf);

	switch(np->n_type) {
	case KEYWORD:
		switch (np->n_keywtype) {
		case PRINT:
		case PRINTF:
			/* yylex reads '>' and '|' differently while inprint */
			++inprint;
			/* FALLTHROUGH */
		default:
			return ((int)np->n_keywtype);
		}
		/* NOTREACHED */

	case ARRAY:
	case VAR:
		/*
		 * If reading the argument list, create a dummy node
		 * for the duration of that function. These variables
		 * can be removed from the symbol table at function end
		 * but they must still exist because the execution tree
		 * knows about them.
		 */
		if (funparm) {
			/* Also entered by goto from the UFUNC case below */
do_funparm:
			np = emptynode(PARM, i=(cp-linebuf));
			np->n_flags = FSTRING;
			np->n_string = _null;
			np->n_strlen = 0;
			/* Copy the name including its terminating null */
			(void) memcpy(np->n_name, linebuf,
				(i+1) * sizeof(wchar_t));
			addsymtab(np);
			yylval.node = np;
		} else if (np == varNF || (np == varFS &&
			(!doing_begin || begin_getline))) {
			/*
			 * If the user program references NF or sets
			 * FS either outside of a begin block or
			 * in a begin block after a getline then the
			 * input line will be split immediately upon read
			 * rather than when a field is first referenced.
			 */
			needsplit = 1;
		} else if (np == varENVIRON)
			needenviron = 1;
		/* FALLTHROUGH */
	case PARM:
		return (VAR);

	case UFUNC:
		/*
		 * It is ok to redefine functions as parameters
		 */
		if (funparm) goto do_funparm;
		/* FALLTHROUGH */
	case FUNC:
	case GETLINE:
		/*
		 * When a getline is encountered, clear the 'doing_begin' flag.
		 * This will force the 'needsplit' flag to be set, even inside
		 * a begin block, if FS is altered. (See VAR case above)
		 *
		 * NOTE(review): the code sets begin_getline rather than
		 * clearing doing_begin, and does so for UFUNC and FUNC as
		 * well as GETLINE - confirm that this is intended.
		 */
		if (doing_begin)
			begin_getline = 1;
		return (np->n_type);
	}
	/* NOTREACHED */
	return (0);
}
846 
847 /*
848  * Read a string for the lexical analyzer.
849  * `endc' terminates the string.
850  */
851 static int
852 lexstring(wint_t endc)
853 {
854 	register size_t length = lexescape(endc, 0, 0);
855 
856 	yylval.node = stringnode(linebuf, FALLOC, length);
857 	return (CONSTANT);
858 }
859 
860 /*
861  * Read a regular expression.
862  */
863 static int
864 lexregexp(wint_t endc)
865 {
866 	(void) lexescape(endc, 1, 0);
867 	yylval.node = renode(linebuf);
868 	return (URE);
869 }
870 
871 /*
872  * Process a string, converting the escape characters as required by
873  * 1003.2. The processed string ends up in the global linebuf[]. This
874  * routine also changes the value of 'progfd' - the program file
875  * descriptor, so it should be used with some care. It is presently used to
876  * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
877  */
878 void
879 strescape(wchar_t *str)
880 {
881 	progptr = str;
882 	proglen = wcslen(str) + 1;	/* Include \0 */
883 	(void) lexescape('\0', 0, 1);
884 	progptr = NULL;
885 }
886 
887 /*
888  * Read a string or regular expression, terminated by ``endc'',
889  * for lexical analyzer, processing escape sequences.
890  * Return string length.
891  */
892 static size_t
893 lexescape(wint_t endc, int regx, int cmd_line_operand)
894 {
895 	static char nlre[256];
896 	static char nlstr[256];
897 	static char eofre[256];
898 	static char eofstr[256];
899 	int first_time = 1;
900 	wint_t c;
901 	wchar_t *cp;
902 	int n, max;
903 
904 	if (first_time == 1) {
905 		(void) strcpy(nlre, gettext("Newline in regular expression\n"));
906 		(void) strcpy(nlstr, gettext("Newline in string\n"));
907 		(void) strcpy(eofre, gettext("EOF in regular expression\n"));
908 		(void) strcpy(eofstr, gettext("EOF in string\n"));
909 		first_time = 0;
910         }
911 
912 	cp = linebuf;
913 	while ((c = lexgetc()) != endc) {
914 		if (c == '\n')
915 			awkerr(regx ? nlre : nlstr);
916 		if (c == '\\') {
917 			switch (c = lexgetc(), c) {
918 			case '\\':
919 				if (regx)
920 					*cp++ = '\\';
921 				break;
922 
923 			case '/':
924 				c = '/';
925 				break;
926 
927 			case 'n':
928 				c = '\n';
929 				break;
930 
931 			case 'b':
932 				c = '\b';
933 				break;
934 
935 			case 't':
936 				c = '\t';
937 				break;
938 
939 			case 'r':
940 				c = '\r';
941 				break;
942 
943 			case 'f':
944 				c = '\f';
945 				break;
946 
947 			case 'v':
948 				c = '\v';
949 				break;
950 
951 			case 'a':
952 				c = (char) 0x07;
953 				break;
954 
955 			case 'x':
956 				n = 0;
957 				while (iswxdigit(c = lexgetc())) {
958 					if (iswdigit(c))
959 						c -= '0';
960 					else if (iswupper(c))
961 						c -= 'A'-10;
962 					else
963 						c -= 'a'-10;
964 					n = (n<<4) + c;
965 				}
966 				lexungetc(c);
967 				c = n;
968 				break;
969 
970 			case '0':
971 			case '1':
972 			case '2':
973 			case '3':
974 			case '4':
975 			case '5':
976 			case '6':
977 			case '7':
978 #if 0
979 /*
980  * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
981  * requires processing of the octal escapes both in strings and
982  * regular expressions. The following code is disabled instead of
983  * removed as back-referencing may be reintroduced in a future draft
984  * of the standard.
985  */
986 				/*
987 				 * For regular expressions, we disallow
988 				 * \ooo to mean octal character, in favour
989 				 * of back referencing.
990 				 */
991 				if (regx) {
992 					*cp++ = '\\';
993 					break;
994 				}
995 #endif
996 				max = 3;
997 				n = 0;
998 				do {
999 					n = (n<<3) + c-'0';
1000 					if ((c = lexgetc())>'7' || c<'0')
1001 						break;
1002 				} while (--max);
1003 				lexungetc(c);
1004 				/*
1005 				 * an octal escape sequence must have at least
1006 				 * 2 digits after the backslash, otherwise
1007 				 * it gets passed straight thru for possible
1008 				 * use in backreferencing.
1009 				 */
1010 				if (max == 3) {
1011 					*cp++ = '\\';
1012 					n += '0';
1013 				}
1014 				c = n;
1015 				break;
1016 
1017 			case '\n':
1018 				continue;
1019 
1020 			default:
1021 				if (c != endc || cmd_line_operand) {
1022 					*cp++ = '\\';
1023 					if (c == endc)
1024 						lexungetc(c);
1025 				}
1026 			}
1027 		}
1028 		if (c == WEOF)
1029 			awkerr(regx ? eofre : eofstr);
1030 		*cp++ = c;
1031 	}
1032 	*cp = '\0';
1033 	return (cp - linebuf);
1034 }
1035 
1036 /*
1037  * Build a regular expression NODE.
1038  * Argument is the string holding the expression.
1039  */
1040 NODE *
1041 renode(wchar_t *s)
1042 {
1043 	register NODE *np;
1044 	int n;
1045 
1046 	np = emptynode(RE, 0);
1047 	np->n_left = np->n_right = NNULL;
1048 	np->n_regexp = (REGEXP)emalloc(sizeof(regex_t));
1049 	if ((n = REGWCOMP(np->n_regexp, s, REG_EXTENDED)) != REG_OK) {
1050 		int m;
1051 		char *p;
1052 
1053 		m = regerror(n, np->n_regexp, NULL, 0);
1054 		p = (char *)emalloc(m);
1055 		regerror(n, np->n_regexp, p, m);
1056 		awkerr("/%S/: %s", s, p);
1057 	}
1058 	return (np);
1059 }
1060 /*
1061  * Get a character for the lexical analyser routine.
1062  */
1063 static wint_t
1064 lexgetc()
1065 {
1066 	register wint_t c;
1067 	static char **files = &progfiles[0];
1068 
1069 	if (progfp!=FNULL && (c = fgetwc(progfp))!=WEOF)
1070 		;
1071 	else {
1072 		if (progptr != NULL) {
1073 			if (proglen-- <= 0)
1074 				c = WEOF;
1075 			else
1076 				c = *progptr++;
1077 		} else {
1078 			if (progfp != FNULL)
1079 				if (progfp != stdin)
1080 					(void)fclose(progfp);
1081 				else
1082 					clearerr(progfp);
1083 				progfp = FNULL;
1084 			if (files < progfilep) {
1085 				filename = *files++;
1086 				lineno = 1;
1087 				if (filename[0]=='-' && filename[1]=='\0')
1088 					progfp = stdin;
1089 				else if ((progfp=fopen(filename, r)) == FNULL) {
1090 					(void) fprintf(stderr,
1091 				gettext("script file \"%s\""), filename);
1092 					exit(1);
1093 				}
1094 				c = fgetwc(progfp);
1095 			}
1096 		}
1097 	}
1098 	if (c == '\n')
1099 		++lineno;
1100 	if (conptr >= &context[NCONTEXT])
1101 		conptr = &context[0];
1102 	if (c != WEOF)
1103 		*conptr++ = c;
1104 	return (c);
1105 }
1106 
1107 /*
1108  * Return a character for lexical analyser.
1109  * Only one returned character is (not enforced) legitimite.
1110  */
1111 static void
1112 lexungetc(wint_t c)
1113 {
1114 	if (c == '\n')
1115 		--lineno;
1116 	if (c != WEOF) {
1117 		if (conptr == &context[0])
1118 			conptr = &context[NCONTEXT];
1119 		*--conptr = '\0';
1120 	}
1121 	if (progfp != FNULL) {
1122 		(void)ungetwc(c, progfp);
1123 		return;
1124 	}
1125 	if (c == WEOF)
1126 		return;
1127 	*--progptr = c;
1128 	proglen++;
1129 }
1130 
1131 /*
1132  * Syntax errors during parsing.
1133  */
1134 void
1135 yyerror(char *s, ...)
1136 {
1137 	if (lexlast==FUNC || lexlast==GETLINE || lexlast==KEYWORD)
1138 		if (lexlast == KEYWORD)
1139 			awkerr(gettext("inadmissible use of reserved keyword"));
1140 		else
1141 			awkerr(gettext("attempt to redefine builtin function"));
1142 	awkerr(s);
1143 }
1144 
/*
 * General error reporting entry point: format the message like
 * printf and hand it to awkierr() with no errno text requested.
 */
/* ARGSUSED */
void
awkerr(char *fmt, ...)
{
	va_list varargs;

	va_start(varargs, fmt);
	awkierr(0, fmt, varargs);
	va_end(varargs);
}
1158 
/*
 * Same as awkerr(), but asks awkierr() to append the text that
 * corresponds to the value of errno.
 */
/* ARGSUSED */
void
awkperr(char *fmt, ...)
{
	va_list varargs;

	va_start(varargs, fmt);
	awkierr(1, fmt, varargs);
	va_end(varargs);
}
1173 
1174 /*
1175  * Common internal routine for awkerr, awkperr
1176  */
1177 static void
1178 awkierr(int perr, char *fmt, va_list ap)
1179 {
1180 	static char sep1[] = "\n>>>\t";
1181 	static char sep2[] = "\t<<<";
1182 	int saveerr = errno;
1183 
1184 	(void) fprintf(stderr, "%s: ", _cmdname);
1185 	if (running) {
1186 		(void) fprintf(stderr, gettext("line %u ("),
1187 		    curnode==NNULL ? 0 : curnode->n_lineno);
1188 		if (phase == 0)
1189 		      (void) fprintf(stderr, "NR=%lld): ", (INT)exprint(varNR));
1190 		else
1191 		      (void) fprintf(stderr, "%s): ",
1192 			    phase==BEGIN ? s_BEGIN : s_END);
1193 	} else if (lineno != 0) {
1194 		(void) fprintf(stderr, gettext("file \"%s\": "), filename);
1195 		(void) fprintf(stderr, gettext("line %u: "), lineno);
1196 	}
1197 	(void) vfprintf(stderr, gettext(fmt), ap);
1198 	if (perr == 1)
1199 		(void) fprintf(stderr, ": %s", strerror(saveerr));
1200 	if (perr != 2 && !running) {
1201 		register wchar_t *cp;
1202 		register int n;
1203 		register int c;
1204 
1205 		(void) fprintf(stderr, gettext("  Context is:%s"), sep1);
1206 		cp = conptr;
1207 		n = NCONTEXT;
1208 		do {
1209 			if (cp >= &context[NCONTEXT])
1210 				cp = &context[0];
1211 			if ((c = *cp++) != '\0')
1212 				(void)fputs(c=='\n' ? sep1 : toprint(c),
1213 					stderr);
1214 		} while (--n != 0);
1215 		(void)fputs(sep2, stderr);
1216 	}
1217 	(void) fprintf(stderr, "\n");
1218 	exit(1);
1219 }
1220 
1221 wchar_t *
1222 emalloc(unsigned n)
1223 {
1224 	wchar_t *cp;
1225 
1226 	if ((cp = malloc(n)) == NULL)
1227 		awkerr(nomem);
1228 	return cp;
1229 }
1230 
1231 wchar_t *
1232 erealloc(wchar_t *p, unsigned n)
1233 {
1234 	wchar_t *cp;
1235 
1236 	if ((cp = realloc(p, n)) == NULL)
1237 		awkerr(nomem);
1238 	return cp;
1239 }
1240 
1241 
/*
 * Print the command synopsis on standard error.
 * Returns 2, which main() uses as its exit status.
 */
static int
usage()
{
	const char *synopsis;

	synopsis = gettext(
"Usage:	awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
"	awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n");
	(void) fprintf(stderr, synopsis);
	return (2);
}
1253 
1254 
/*
 * Convert `str' to a wide string held in a single buffer that is
 * reused: the result of the previous call is released first, so
 * the pointer returned is only good until the next call.
 */
static wchar_t *
mbconvert(char *str)
{
	static wchar_t *last = NULL;

	free(last);		/* free(NULL) does nothing */
	last = mbstowcsdup(str);
	return (last);
}
1264 
/*
 * Convert the wide string `str' to a multibyte string held in a
 * single buffer that is reused: the result of the previous call
 * is released first, so the pointer returned is only good until
 * the next call.
 */
char *
mbunconvert(wchar_t *str)
{
	static char *last = NULL;

	free(last);		/* free(NULL) does nothing */
	last = wcstombsdup(str);
	return (last);
}
1274 
1275 /*
1276  * Solaris port - following functions are typical MKS functions written
1277  * to work for Solaris.
1278  */
1279 
/*
 * Return a newly allocated wide character copy of the multibyte
 * string `s', or NULL when memory cannot be obtained or `s' cannot
 * be converted.  The caller frees the result.
 */
wchar_t *
mbstowcsdup(char *s)
{
	size_t n;
	wchar_t *w;

	/* One wide character per byte is always enough */
	n = strlen(s) + 1;
	if ((w = malloc(n * sizeof (wchar_t))) == NULL)
		return (NULL);

	if (mbstowcs(w, s, n) == (size_t)-1) {
		/* Do not leak the buffer; keep errno from the conversion */
		int saverr = errno;

		free(w);
		errno = saverr;
		return (NULL);
	}
	return (w);
}
1296 
/*
 * Return a newly allocated multibyte copy of the wide string `w',
 * or NULL when memory cannot be obtained or `w' cannot be converted
 * (errno is then the one left by the conversion).  The caller frees
 * the result.
 */
char *
wcstombsdup(wchar_t *w)
{
	size_t n;
	char *mb;
	char *shrunk;

	/* Fetch memory for worst case string length */
	n = (wcslen(w) + 1) * MB_CUR_MAX;
	if ((mb = malloc(n)) == NULL)
		return (NULL);

	/* Convert the string */
	if (wcstombs(mb, w, n) == (size_t)-1) {
		int saverr = errno;

		free(mb);
		errno = saverr;
		return (NULL);
	}

	/*
	 * Shrink the string down.  If that fails the original buffer is
	 * still valid, so hand it back instead of losing it.
	 */
	if ((shrunk = realloc(mb, strlen(mb) + 1)) == NULL)
		return (mb);
	return (shrunk);
}
1325 
/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  The first entry is for value 127, thus the
 * entries for the remaining character values are from 1..32.
 */
static const char *const upe_ctrls[] =
{
        "^?",
        "^@",  "^A",  "^B",  "^C",  "^D",  "^E",  "^F",  "^G",
        "^H",  "^I",  "^J",  "^K",  "^L",  "^M",  "^N",  "^O",
        "^P",  "^Q",  "^R",  "^S",  "^T",  "^U",  "^V",  "^W",
        "^X",  "^Y",  "^Z",  "^[",  "^\\", "^]",  "^^",  "^_"
};


/*
 * Return a printable string corresponding to the given character value.  If
 * the character is printable, simply return it as the string.  If it is in
 * the range specified by table 5-101 in the UPE, return the corresponding
 * string.  Otherwise, return an octal escape sequence.
 *
 * The result lives in static storage and is overwritten by the next call.
 * The parameter is a wint_t, matching the prototype at the top of the file.
 */
static const char *
toprint(wint_t c)
{
        int n, len;
        unsigned char *ptr;
        static char mbch[MB_LEN_MAX+1];
        static char buf[5 * MB_LEN_MAX + 1];

        if ((n = wctomb(mbch, (wchar_t)c)) == -1) {
                /* Should never happen */
                (void) snprintf(buf, sizeof (buf), "\\%x", (unsigned int)c);
                return (buf);
        }
        mbch[n] = '\0';
        if (iswprint(c)) {
                return (mbch);
        } else if (c == 127) {
                return (upe_ctrls[0]);
        } else if ((unsigned long)c < 32) {
                /*
                 * Print as in Table 5-101 in the UPE.  The unsigned
                 * comparison keeps a negative value out of the table.
                 */
                return (upe_ctrls[c+1]);
        } else {
                /* Print as an octal escape sequence, one per byte */
                for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr)
                        len += snprintf(buf+len, sizeof (buf) - len,
                            "\\%03o", *ptr);
        }
        return (buf);
}
1376 
/*
 * Translate a byte offset into the multibyte form of `astring' into the
 * corresponding index into the wide-character string itself.
 *
 * Wide characters are consumed until their multibyte lengths add up to at
 * least `off' bytes or the end of the string is reached, so the result never
 * exceeds the length of `astring'.  A character that wctomb() cannot
 * convert is counted as one byte.
 */
static int
wcoff(const wchar_t *astring, const int off)
{
	const wchar_t *s = astring;
	int c = 0;
	char mb[MB_LEN_MAX];

	/*
	 * The terminator is tested explicitly: wctomb() reports a length of
	 * 1, not 0, for the null wide character, so its return value cannot
	 * be used to detect the end of the string.
	 */
	while (c < off && *s != L'\0') {
		int n = wctomb(mb, *s);

		if (n == -1)
			n = 1;
		c += n;
		s++;
	}

	return ((int)(s - astring));
}
1396 
/*
 * Compile the wide-character regular expression `pattern' into `r'.
 *
 * The pattern is converted to a temporary multibyte string, handed to
 * regcomp() together with `uflags', and the temporary is released again.
 * Returns regcomp()'s result, or REG_ESPACE if the multibyte copy could not
 * be produced.
 */
int
int_regwcomp(register regex_t *r, const wchar_t *pattern, int uflags)
{
	int status;
	char *mbs = wcstombsdup((wchar_t *) pattern);

	if (mbs == NULL)
		return (REG_ESPACE);

	status = regcomp(r, mbs, uflags);
	free(mbs);

	return (status);
}
1412 
1413 int
1414 int_regwexec(const regex_t *r,	/* compiled RE */
1415 	const wchar_t *astring,	/* subject string */
1416 	size_t nsub,		/* number of subexpressions */
1417 	int_regwmatch_t *sub,	/* subexpression pointers */
1418 	int flags)
1419 {
1420 	char *mbs;
1421 	regmatch_t *mbsub = NULL;
1422 	register int i;
1423 
1424 	if ((mbs = wcstombsdup((wchar_t *) astring)) == NULL)
1425 		return (REG_ESPACE);
1426 
1427 	if (nsub > 0 && sub) {
1428 		if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
1429 			return (REG_ESPACE);
1430 	}
1431 
1432 	i = regexec(r, mbs, nsub, mbsub, flags);
1433 
1434 	/* Now, adjust the pointers/counts in sub */
1435 	if (i == REG_OK && nsub > 0 && mbsub) {
1436 		register int j, k;
1437 
1438 		for (j = 0; j < nsub; j++) {
1439 			regmatch_t *ms = &mbsub[j];
1440 			int_regwmatch_t *ws = &sub[j];
1441 
1442 			if ((k = ms->rm_so) >= 0) {
1443 				ws->rm_so = wcoff(astring, k);
1444 				ws->rm_sp = astring + ws->rm_so;
1445 			}
1446 			if ((k = ms->rm_eo) >= 0) {
1447 				ws->rm_eo = wcoff(astring, k);
1448 				ws->rm_ep = astring + ws->rm_eo;
1449 			}
1450 		}
1451 	}
1452 
1453 	free(mbs);
1454 	if (mbsub)
1455 		free(mbsub);
1456 	return (i);
1457 }
1458 
1459 int
1460 int_regwdosuba(register regex_t *rp,	/* compiled RE: Pattern */
1461 	const wchar_t *rpl,		/* replacement string: /rpl/ */
1462 	const wchar_t *src,		/* source string */
1463 	wchar_t **dstp,			/* destination string */
1464 	int len,			/* destination length */
1465 	int *globp)	/* IN: occurence, 0 for all; OUT: substitutions */
1466 {
1467 	wchar_t *dst, *odst;
1468 	register const wchar_t *ip, *xp;
1469 	register wchar_t *op;
1470 	register int i;
1471 	register wchar_t c;
1472 	int glob, iglob = *globp, oglob = 0;
1473 #define	NSUB	10
1474 	int_regwmatch_t rm[NSUB], *rmp;
1475 	int flags;
1476 	wchar_t *end;
1477 	int regerr;
1478 
1479 /* handle overflow of dst. we need "i" more bytes */
1480 #ifdef OVERFLOW
1481 #undef OVERFLOW
1482 #define	OVERFLOW(i) if (1) { \
1483 		int pos = op - dst; \
1484 		dst = (wchar_t *) realloc(odst = dst, \
1485 			(len += len + i) * sizeof (wchar_t)); \
1486 		if (dst == NULL) \
1487 			goto nospace; \
1488 		op = dst + pos; \
1489 		end = dst + len; \
1490 	} else
1491 #endif
1492 
1493 	*dstp = dst = (wchar_t *) malloc(len * sizeof (wchar_t));
1494 	if (dst == NULL)
1495 		return (REG_ESPACE);
1496 
1497 	if (rp == NULL || rpl == NULL || src == NULL || dst ==  NULL)
1498 		return (REG_EFATAL);
1499 
1500 	glob = 0;	/* match count */
1501 	ip = src;	/* source position */
1502 	op = dst;	/* destination position */
1503 	end = dst + len;
1504 
1505 	flags = 0;
1506 	while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
1507 		/* Copy text preceding match */
1508 		if (op + (i = rm[0].rm_sp - ip) >= end)
1509 			OVERFLOW(i);
1510 		while (i--)
1511 			*op++ = *ip++;
1512 
1513 		if (iglob == 0 || ++glob == iglob) {
1514 			oglob++;
1515 			xp = rpl;		/* do substitute */
1516 		} else
1517 			xp = L"&";		/* preserve text */
1518 
1519 		/* Perform replacement of matched substing */
1520 		while ((c = *xp++) != '\0') {
1521 			rmp = NULL;
1522 			if (c == '&')
1523 				rmp = &rm[0];
1524 			else if (c == '\\') {
1525 				if ('0' <= *xp && *xp <= '9')
1526 					rmp = &rm[*xp++ - '0'];
1527 				else if (*xp != '\0')
1528 					c = *xp++;
1529 			}
1530 
1531 			if (rmp ==  NULL) {	/* Ordinary character. */
1532 				*op++ = c;
1533 				if (op >= end)
1534 					OVERFLOW(1);
1535 			} else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
1536 				ip = rmp->rm_sp;
1537 				if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
1538 					OVERFLOW(i);
1539 				while (i--)
1540 					*op++ = *ip++;
1541 			}
1542 		}
1543 
1544 		ip = rm[0].rm_ep;
1545 		if (*ip == '\0')	/* If at end break */
1546 			break;
1547 		else if (rm[0].rm_sp == rm[0].rm_ep) {
1548 			/* If empty match copy next char */
1549 			*op++ = *ip++;
1550 			if (op >= end)
1551 				OVERFLOW(1);
1552 		}
1553 		flags = REG_NOTBOL;
1554 	}
1555 
1556 	if (regerr != REG_OK && regerr != REG_NOMATCH)
1557 		return (regerr);
1558 
1559 	/* Copy rest of text */
1560 	if (op + (i =  wcslen(ip)) >= end)
1561 		OVERFLOW(i);
1562 	while (i--)
1563 	    *op++ = *ip++;
1564 	*op++ = '\0';
1565 
1566 	if ((*dstp = dst = (wchar_t *) realloc(odst = dst,
1567 			sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
1568 nospace:
1569 		free(odst);
1570 		return (REG_ESPACE);
1571 	}
1572 
1573 	*globp = oglob;
1574 
1575 	return ((oglob == 0) ? REG_NOMATCH : REG_OK);
1576 }
1577