xref: /illumos-gate/usr/src/cmd/awk_xpg4/awk1.c (revision 1a90c98d7539778aeb0a1d20f735b66aaba17fca)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
28  */
29 
30 /*
31  * awk -- mainline, yylex, etc.
32  *
33  * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
34  */
35 
36 #include "awk.h"
37 #include "y.tab.h"
38 #include <stdarg.h>
39 #include <unistd.h>
40 #include <locale.h>
41 #include <search.h>
42 
/*
 * Lexer input state.  The program text comes either from the files named
 * by -f (progfiles[], read through progfp) or from an in-memory wide
 * string (progptr/proglen).
 */
static char	*progfiles[NPFILE];	/* Program files (-f) for yylex */
static char	**progfilep = &progfiles[0]; /* One past the last stored file */
static wchar_t	*progptr;		/* In-memory program (next char) */
static int	proglen;		/* Characters left at progptr */
static wchar_t	context[NCONTEXT];	/* Circular buffer of recent input */
static wchar_t	*conptr = &context[0];	/* Next free slot in context[] */
static FILE	*progfp;		/* Stdio stream for program file */
static char	*filename;		/* Script name used in diagnostics */
#ifdef	DEBUG
static int	dflag;			/* -d: yylex prints each token value */
#endif

#define	AWK_EXEC_MAGIC	"<MKS AWKC>"
#define	LEN_EXEC_MAGIC	10

/* Format handed to awkerr() by yylex for unmatched (), {} and []. */
static char	unbal[] = "unbalanced E char";

/* Lexer helpers, each given the character that introduced the token. */
static void	awkarginit(int c, char **av);
static int	lexid(wint_t c);
static int	lexnumber(wint_t c);
static int	lexstring(wint_t endc);
static int	lexregexp(wint_t endc);

static void	awkvarinit(void);
static wint_t	lexgetc(void);
static void	lexungetc(wint_t c);
static size_t	lexescape(wint_t endc, int regx, int cmd_line_operand);
static void	awkierr(int perr, const char *fmt, va_list ap) __NORETURN;
static int	usage(void);
void		strescape(wchar_t *str);
static const char	*toprint(wint_t);
char *_cmdname;				/* argv[0], prefix of every diagnostic */
static wchar_t *mbconvert(char *str);

extern int	isclvar(wchar_t *arg);
78 
79 /*
80  * mainline for awk
81  */
82 int
main(int argc,char * argv[])83 main(int argc, char *argv[])
84 {
85 	wchar_t *ap;
86 	char *cmd;
87 
88 	cmd = argv[0];
89 	_cmdname = cmd;
90 
91 	linebuf = emalloc(NLINE * sizeof (wchar_t));
92 
93 	/*
94 	 * At this point only messaging should be internationalized.
95 	 * numbers are still scanned as in the Posix locale.
96 	 */
97 	(void) setlocale(LC_ALL, "");
98 	(void) setlocale(LC_NUMERIC, "C");
99 #if !defined(TEXT_DOMAIN)
100 #define	TEXT_DOMAIN	"SYS_TEST"
101 #endif
102 	(void) textdomain(TEXT_DOMAIN);
103 
104 	awkvarinit();
105 	/* running = 1; */
106 	while (argc > 1 && *argv[1] == '-') {
107 		void *save_ptr = NULL;
108 		ap = mbstowcsdup(&argv[1][1]);
109 		if (ap == NULL)
110 			break;
111 		if (*ap == '\0') {
112 			free(ap);
113 			break;
114 		}
115 		save_ptr = (void *) ap;
116 		++argv;
117 		--argc;
118 		if (*ap == '-' && ap[1] == '\0')
119 			break;
120 		for (; *ap != '\0'; ++ap) {
121 			switch (*ap) {
122 #ifdef DEBUG
123 			case 'd':
124 				dflag = 1;
125 				continue;
126 
127 #endif
128 			case 'f':
129 				if (argc < 2) {
130 					(void) fprintf(stderr,
131 				gettext("Missing script file\n"));
132 					return (1);
133 				}
134 				*progfilep++ = argv[1];
135 				--argc;
136 				++argv;
137 				continue;
138 
139 			case 'F':
140 				if (ap[1] == '\0') {
141 					if (argc < 2) {
142 						(void) fprintf(stderr,
143 				gettext("Missing field separator\n"));
144 						return (1);
145 					}
146 					ap = mbstowcsdup(argv[1]);
147 					--argc;
148 					++argv;
149 				} else
150 					++ap;
151 				strescape(ap);
152 				strassign(varFS, linebuf, FALLOC,
153 				    wcslen(linebuf));
154 				break;
155 
156 			case 'v': {
157 				wchar_t *vp;
158 				wchar_t *arg;
159 
160 				if (argc < 2) {
161 					(void) fprintf(stderr,
162 		gettext("Missing variable assignment\n"));
163 					return (1);
164 				}
165 				arg = mbconvert(argv[1]);
166 				/*
167 				 * Ensure the variable expression
168 				 * is valid (correct form).
169 				 */
170 				if (((vp = wcschr(arg, '=')) != NULL) &&
171 				    isclvar(arg)) {
172 					*vp = '\0';
173 					strescape(vp+1);
174 					strassign(vlook(arg), linebuf,
175 					    FALLOC|FSENSE,
176 					    wcslen(linebuf));
177 					*vp = '=';
178 				} else {
179 					(void) fprintf(stderr, gettext(
180 					    "Invalid form for variable "
181 					    "assignment: %S\n"), arg);
182 					return (1);
183 				}
184 				--argc;
185 				++argv;
186 				continue;
187 			}
188 
189 			default:
190 				(void) fprintf(stderr,
191 				gettext("Unknown option \"-%S\"\n"), ap);
192 				return (usage());
193 			}
194 			break;
195 		}
196 		if (save_ptr)
197 			free(save_ptr);
198 	}
199 	if (progfilep == &progfiles[0]) {
200 		if (argc < 2)
201 			return (usage());
202 		filename = "[command line]";	/* BUG: NEEDS TRANSLATION */
203 		progptr = mbstowcsdup(argv[1]);
204 		proglen = wcslen(progptr);
205 		--argc;
206 		++argv;
207 	}
208 
209 	argv[0] = cmd;
210 
211 	awkarginit(argc, argv);
212 
213 	/* running = 0; */
214 	(void) yyparse();
215 
216 	lineno = 0;
217 	/*
218 	 * Ok, done parsing, so now activate the rest of the nls stuff, set
219 	 * the radix character.
220 	 */
221 	(void) setlocale(LC_ALL, "");
222 	radixpoint = *localeconv()->decimal_point;
223 	awk();
224 	/* NOTREACHED */
225 	return (0);
226 }
227 
228 /*
229  * Do initial setup of buffers, etc.
230  * This must be called before most processing
231  * and especially before lexical analysis.
232  * Variables initialised here will be overruled by command
233  * line parameter initialisation.
234  */
235 static void
awkvarinit()236 awkvarinit()
237 {
238 	NODE *np;
239 
240 	(void) setvbuf(stderr, NULL, _IONBF, 0);
241 
242 	if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
243 		(void) fprintf(stderr,
244 	gettext("not enough available file descriptors"));
245 		exit(1);
246 	}
247 	ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
248 #ifdef A_ZERO_POINTERS
249 	(void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
250 #else
251 	{
252 		/* initialize file descriptor table */
253 		OFILE *fp;
254 		for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
255 			fp->f_fp = FNULL;
256 					fp->f_mode = 0;
257 					fp->f_name = (char *)0;
258 		}
259 	}
260 #endif
261 	constant = intnode((INT)0);
262 
263 	const0 = intnode((INT)0);
264 	const1 = intnode((INT)1);
265 	constundef = emptynode(CONSTANT, 0);
266 	constundef->n_flags = FSTRING|FVINT;
267 	constundef->n_string = _null;
268 	constundef->n_strlen = 0;
269 	inc_oper = emptynode(ADD, 0);
270 	inc_oper->n_right = const1;
271 	asn_oper = emptynode(ADD, 0);
272 	field0 = node(FIELD, const0, NNULL);
273 
274 	{
275 		RESFUNC near*rp;
276 
277 		for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
278 			np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
279 		}
280 	}
281 	{
282 		RESERVED near*rp;
283 
284 		for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
285 			switch (rp->r_type) {
286 			case SVAR:
287 			case VAR:
288 				running = 1;
289 				np = vlook(rp->r_name);
290 				if (rp->r_type == SVAR)
291 					np->n_flags |= FSPECIAL;
292 				if (rp->r_svalue != NULL)
293 					strassign(np, rp->r_svalue, FSTATIC,
294 					    (size_t)rp->r_ivalue);
295 				else {
296 					constant->n_int = rp->r_ivalue;
297 					(void) assign(np, constant);
298 				}
299 				running = 0;
300 				break;
301 
302 			case KEYWORD:
303 				kinstall(rp->r_name, (int)rp->r_ivalue);
304 				break;
305 			}
306 		}
307 	}
308 
309 	varNR = vlook(s_NR);
310 	varFNR = vlook(s_FNR);
311 	varNF = vlook(s_NF);
312 	varOFMT = vlook(s_OFMT);
313 	varCONVFMT = vlook(s_CONVFMT);
314 	varOFS = vlook(s_OFS);
315 	varORS = vlook(s_ORS);
316 	varRS = vlook(s_RS);
317 	varFS = vlook(s_FS);
318 	varARGC = vlook(s_ARGC);
319 	varSUBSEP = vlook(s_SUBSEP);
320 	varENVIRON = vlook(s_ENVIRON);
321 	varFILENAME = vlook(s_FILENAME);
322 	varSYMTAB = vlook(s_SYMTAB);
323 	incNR = node(ASG, varNR, node(ADD, varNR, const1));
324 	incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
325 	clrFNR = node(ASG, varFNR, const0);
326 }
327 
328 /*
329  * Initialise awk ARGC, ARGV variables.
330  */
331 static void
awkarginit(int ac,char ** av)332 awkarginit(int ac, char **av)
333 {
334 	int i;
335 	wchar_t *cp;
336 
337 	ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
338 	running = 1;
339 	constant->n_int = ac;
340 	(void) assign(varARGC, constant);
341 	for (i = 0; i < ac; ++i) {
342 		cp = mbstowcsdup(av[i]);
343 		constant->n_int = i;
344 		strassign(exprreduce(ARGVsubi), cp,
345 		    FSTATIC|FSENSE, wcslen(cp));
346 	}
347 	running = 0;
348 }
349 
350 /*
351  * Clean up when done parsing a function.
352  * All formal parameters, because of a deal (funparm) in
353  * yylex, get put into the symbol table in front of any
354  * global variable of the same name.  When the entire
355  * function is parsed, remove these formal dummy nodes
356  * from the symbol table but retain the nodes because
357  * the generated tree points at them.
358  */
359 void
uexit(NODE * np)360 uexit(NODE *np)
361 {
362 	NODE *formal;
363 
364 	while ((formal = getlist(&np)) != NNULL)
365 		delsymtab(formal, 0);
366 }
367 
/*
 * The lexical analyzer.
 *
 * Returns the next token of the awk program to the parser and leaves
 * its value, where there is one, in yylval.  Besides plain tokenising
 * it:
 *   - hands back a token queued by the previous call (savetoken);
 *   - reads a regular expression when `redelim' has been set;
 *   - turns a newline into ';' or drops it, depending on the previous
 *     token (lexlast) and the brace depth;
 *   - inserts ';' in front of '}' when the previous token was not ';';
 *   - inserts CONCAT between two adjacent terms, tracked by `catterm';
 *   - maps single-character tokens to their symbolic names (ctosym).
 */
int
yylex()
{
	wint_t c, c1;
	int i;
	static int savetoken = 0;	/* token to return on the next call */
	static int wasfield;		/* counts down after a "$constant" */
	static int isfuncdef;		/* previous token was DEFFUNC */
	static int nbrace, nparen, nbracket;	/* open bracket depths */
	/* Single characters and the token names they are returned as. */
	static struct ctosymstruct {
		wint_t c, sym;
	} ctosym[] = {
		{ '|', BAR },		{ '^', CARAT },
		{ '~', TILDE },		{ '<', LANGLE },
		{ '>', RANGLE },	{ '+', PLUSC },
		{ '-', HYPHEN },	{ '*', STAR },
		{ '/', SLASH },		{ '%', PERCENT },
		{ '!', EXCLAMATION },	{ '$', DOLLAR },
		{ '[', LSQUARE },	{ ']', RSQUARE },
		{ '(', LPAREN },	{ ')', RPAREN },
		{ ';', SEMI },		{ '{', LBRACE },
		{ '}', RBRACE },	{   0, 0 }
	};

	if (savetoken) {
		/* A token was held back by the previous call. */
		c = savetoken;
		savetoken = 0;
	} else if (redelim != '\0') {
		/*
		 * A regular expression is wanted: read it up to the
		 * delimiter and queue the delimiter itself as the next
		 * token.  This path skips the post-processing below.
		 */
		c = redelim;
		redelim = 0;
		catterm = 0;
		savetoken = c;
		c = lexlast = lexregexp(c);
		goto out;
	} else while ((c = lexgetc()) != WEOF) {
		if (iswalpha(c) || c == '_') {
			c = lexid(c);
		} else if (iswdigit(c) || c == '.') {
			c = lexnumber(c);
		} else if (isWblank(c)) {
			continue;
		} else switch (c) {
#if DOS || OS2
		case 032:		/* ^Z */
			continue;
#endif

		case '"':
			c = lexstring(c);
			break;

		case '#':
			/* Comment: skip to end of line, keep the newline. */
			while ((c = lexgetc()) != '\n' && c != WEOF)
				;
			lexungetc(c);
			continue;

		case '+':
			if ((c1 = lexgetc()) == '+')
				c = INC;
			else if (c1 == '=')
				c = AADD;
			else
				lexungetc(c1);
			break;

		case '-':
			if ((c1 = lexgetc()) == '-')
				c = DEC;
			else if (c1 == '=')
				c = ASUB;
			else
				lexungetc(c1);
			break;

		case '*':
			/* "*=", "**", "**=" or plain "*" */
			if ((c1 = lexgetc()) == '=')
				c = AMUL;
			else if (c1 == '*') {
				if ((c1 = lexgetc()) == '=')
					c = AEXP;
				else {
					c = EXP;
					lexungetc(c1);
				}
			} else
				lexungetc(c1);
			break;

		case '^':
			if ((c1 = lexgetc()) == '=') {
				c = AEXP;
			} else {
				c = EXP;
				lexungetc(c1);
			}
			break;

		case '/':
			/*
			 * "/=" is taken as divide-assign only when the
			 * previous token is none of those listed here;
			 * otherwise the '=' is pushed back and '/' is
			 * returned on its own.
			 */
			if ((c1 = lexgetc()) == '=' &&
			    lexlast != RE && lexlast != NRE &&
			    lexlast != ';' && lexlast != '\n' &&
			    lexlast != ',' && lexlast != '(')
				c = ADIV;
			else
				lexungetc(c1);
			break;

		case '%':
			if ((c1 = lexgetc()) == '=')
				c = AREM;
			else
				lexungetc(c1);
			break;

		case '&':
			if ((c1 = lexgetc()) == '&')
				c = AND;
			else
				lexungetc(c1);
			break;

		case '|':
			/* A single '|' inside print/printf is a pipe. */
			if ((c1 = lexgetc()) == '|')
				c = OR;
			else {
				lexungetc(c1);
				if (inprint)
					c = PIPE;
			}
			break;

		case '>':
			/*
			 * A '>' inside print/printf and outside parentheses
			 * is output redirection, not a comparison.
			 */
			if ((c1 = lexgetc()) == '=')
				c = GE;
			else if (c1 == '>')
				c = APPEND;
			else {
				lexungetc(c1);
				if (nparen == 0 && inprint)
					c = WRITE;
			}
			break;

		case '<':
			if ((c1 = lexgetc()) == '=')
				c = LE;
			else
				lexungetc(c1);
			break;

		case '!':
			if ((c1 = lexgetc()) == '=')
				c = NE;
			else if (c1 == '~')
				c = NRE;
			else
				lexungetc(c1);
			break;

		case '=':
			if ((c1 = lexgetc()) == '=')
				c = EQ;
			else {
				lexungetc(c1);
				c = ASG;
			}
			break;

		case '\n':
			/*
			 * A newline either ends the statement (returned
			 * as ';') or is dropped, depending on the token
			 * before it.
			 */
			switch (lexlast) {
			case ')':
				if (catterm || inprint) {
					c = ';';
					break;
				}
			/* FALLTHROUGH */
			case AND:
			case OR:
			case COMMA:
			case '{':
			case ELSE:
			case ';':
			case DO:
				continue;

			case '}':
				/* Dropped unless this '}' closed the outermost block. */
				if (nbrace != 0)
					continue;
				/* FALLTHROUGH */

			default:
				c = ';';
				break;
			}
			break;

		case ELSE:
			/* Return ';' first and queue ELSE behind it. */
			if (lexlast != ';') {
				savetoken = ELSE;
				c = ';';
			}
			break;

		case '(':
			++nparen;
			break;

		case ')':
			if (--nparen < 0)
				awkerr(unbal, "()");
			break;

		case '{':
			nbrace++;
			break;

		case '}':
			if (--nbrace < 0) {
				char brk[3];

				brk[0] = '{';
				brk[1] = '}';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			/* Return ';' first and queue the '}' behind it. */
			if (lexlast != ';') {
				savetoken = c;
				c = ';';
			}
			break;

		case '[':
			++nbracket;
			break;

		case ']':
			if (--nbracket < 0) {
				char brk[3];

				brk[0] = '[';
				brk[1] = ']';
				brk[2] = '\0';
				awkerr(unbal, brk);
			}
			break;

		case '\\':
			/* Backslash-newline is a line continuation. */
			if ((c1 = lexgetc()) == '\n')
				continue;
			lexungetc(c1);
			break;

		case ',':
			c = COMMA;
			break;

		case '?':
			c = QUEST;
			break;

		case ':':
			c = COLON;
			break;

		default:
			if (!iswprint(c))
				awkerr(
				    gettext("invalid character \"%s\""),
				    toprint(c));
			break;
		}
		/* One token has been recognised; leave the read loop. */
		break;
	}

	/*
	 * Concatenation has no operator character.  `catterm' is non-zero
	 * when the previous token can be the left operand of one; if the
	 * current token can start a term, CONCAT is returned now and the
	 * token itself is queued in savetoken.
	 */
	switch (c) {
	case ']':
		++catterm;
		break;

	case VAR:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else if (!isfuncdef) {
			/* A name followed by '(' is not counted as a term. */
			if ((c1 = lexgetc()) != '(')
				++catterm;
			lexungetc(c1);
		}
		isfuncdef = 0;
		break;

	case PARM:
	case CONSTANT:
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		} else {
			/* Remember "$constant" for the INC/DEC test below. */
			if (lexlast == '$')
				wasfield = 2;
			++catterm;
		}
		break;

	case INC:
	case DEC:
		/*
		 * Falls through to the CONCAT test only when a term is
		 * pending, the previous token was a CONSTANT and that
		 * constant was not a field number.
		 */
		if (!catterm || lexlast != CONSTANT || wasfield)
			break;

	/* FALLTHROUGH */
	case UFUNC:
	case FUNC:
	case GETLINE:
	case '!':
	case '$':
	case '(':
		if (catterm) {
			savetoken = c;
			c = CONCAT;
			catterm = 0;
		}
		break;

	case '}':
		/* The '}' closing the outermost block is followed by ';'. */
		if (nbrace == 0)
			savetoken = ';';
	/* FALLTHROUGH */
	case ';':
		inprint = 0;
	/* FALLTHROUGH */
	default:
		if (c == DEFFUNC)
			isfuncdef = 1;
		catterm = 0;
	}
	lexlast = c;
	if (wasfield)
		wasfield--;
	/*
	 * Map character constants to symbolic names.
	 */
	for (i = 0; ctosym[i].c != 0; i++)
		if (c == ctosym[i].c) {
			c = ctosym[i].sym;
			break;
		}
out:
#ifdef DEBUG
	if (dflag)
		(void) printf("%d\n", (int)c);
#endif
	return ((int)c);
}
726 
/*
 * Read a number for the lexical analyzer.
 * Input is the first character of the number (a digit or '.').
 * The text of the number is collected in linebuf.
 * Return value is the lexical type: DOT when the text is a lone '.',
 * otherwise CONSTANT with yylval.node set to an integer node (no '.',
 * no exponent and no range error from wcstol) or to a real node.
 */
static int
lexnumber(wint_t c)
{
	wchar_t *cp;
	int dotfound = 0;
	int efound = 0;
	INT number;

	cp = linebuf;
	do {
		if (iswdigit(c))
			;
		else if (c == '.') {
			/* a second '.' ends the number */
			if (dotfound++)
				break;
		} else if (c == 'e' || c == 'E') {
			/*
			 * Exponent.  With a sign following, 'e' is stored
			 * here and the sign by the store below; without
			 * one, the character read ahead is pushed back
			 * and 'e' itself is stored below.
			 */
			if ((c = lexgetc()) != '-' && c != '+') {
				lexungetc(c);
				c = 'e';
			} else
				*cp++ = 'e';
			/* a second exponent ends the number */
			if (efound++)
				break;
		} else
			break;
		*cp++ = c;
	} while ((c = lexgetc()) != WEOF);
	*cp = '\0';
	/* Only a '.' was collected. */
	if (dotfound && cp == linebuf+1)
		return (DOT);
	/* Push back the character that ended the number. */
	lexungetc(c);
	errno = 0;
	/* wcstol() is tried only for plain digit strings. */
	if (!dotfound && !efound &&
	    ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
		yylval.node = intnode(number);
	else
		yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
	return (CONSTANT);
}
771 
/*
 * Read an identifier.
 * Input is first character of identifier.
 * The name is collected in linebuf and looked up with vlook(); the
 * resulting node is left in yylval.node.
 * Returns the keyword type for a keyword, VAR for a variable, array or
 * parameter, and the node type for UFUNC, FUNC and GETLINE nodes.
 */
static int
lexid(wint_t c)
{
	wchar_t *cp;
	size_t i;
	NODE *np;

	cp = linebuf;
	do {
		*cp++ = c;
		c = lexgetc();
	} while (iswalpha(c) || iswdigit(c) || c == '_');
	*cp = '\0';
	/* Push back the character that ended the identifier. */
	lexungetc(c);
	yylval.node = np = vlook(linebuf);

	switch (np->n_type) {
	case KEYWORD:
		switch (np->n_keywtype) {
		case PRINT:
		case PRINTF:
			/* yylex treats '>' and '|' specially while inprint is set */
			++inprint;
			/* FALLTHROUGH */
		default:
			return ((int)np->n_keywtype);
		}
		/* NOTREACHED */

	case ARRAY:
	case VAR:
		/*
		 * If reading the argument list, create a dummy node
		 * for the duration of that function. These variables
		 * can be removed from the symbol table at function end
		 * but they must still exist because the execution tree
		 * knows about them.
		 */
		if (funparm) {
do_funparm:
			/* i is the name length, excluding the terminator */
			np = emptynode(PARM, i = (cp-linebuf));
			np->n_flags = FSTRING;
			np->n_string = _null;
			np->n_strlen = 0;
			(void) memcpy(np->n_name, linebuf,
			    (i+1) * sizeof (wchar_t));
			addsymtab(np);
			yylval.node = np;
		} else if (np == varNF || (np == varFS &&
		    (!doing_begin || begin_getline))) {
			/*
			 * If the user program references NF or sets
			 * FS either outside of a begin block or
			 * in a begin block after a getline then the
			 * input line will be split immediately upon read
			 * rather than when a field is first referenced.
			 */
			needsplit = 1;
		} else if (np == varENVIRON)
			needenviron = 1;
	/* FALLTHROUGH */
	case PARM:
		return (VAR);

	case UFUNC:
		/*
		 * It is ok to redefine functions as parameters
		 */
		if (funparm) goto do_funparm;
	/* FALLTHROUGH */
	case FUNC:
	case GETLINE:
		/*
		 * Inside a begin block, note that a getline (or, as coded
		 * here, any function) has been seen by setting
		 * 'begin_getline'.  A later reference to FS in that begin
		 * block then sets 'needsplit'. (See VAR case above)
		 */
		if (doing_begin)
			begin_getline = 1;
		return (np->n_type);
	}
	/* NOTREACHED */
	return (0);
}
860 
861 /*
862  * Read a string for the lexical analyzer.
863  * `endc' terminates the string.
864  */
865 static int
lexstring(wint_t endc)866 lexstring(wint_t endc)
867 {
868 	size_t length = lexescape(endc, 0, 0);
869 
870 	yylval.node = stringnode(linebuf, FALLOC, length);
871 	return (CONSTANT);
872 }
873 
874 /*
875  * Read a regular expression.
876  */
877 static int
lexregexp(wint_t endc)878 lexregexp(wint_t endc)
879 {
880 	(void) lexescape(endc, 1, 0);
881 	yylval.node = renode(linebuf);
882 	return (URE);
883 }
884 
885 /*
886  * Process a string, converting the escape characters as required by
887  * 1003.2. The processed string ends up in the global linebuf[]. This
888  * routine also changes the value of 'progfd' - the program file
889  * descriptor, so it should be used with some care. It is presently used to
890  * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
891  */
892 void
strescape(wchar_t * str)893 strescape(wchar_t *str)
894 {
895 	progptr = str;
896 	proglen = wcslen(str) + 1;	/* Include \0 */
897 	(void) lexescape('\0', 0, 1);
898 	progptr = NULL;
899 }
900 
901 /*
902  * Read a string or regular expression, terminated by ``endc'',
903  * for lexical analyzer, processing escape sequences.
904  * Return string length.
905  */
906 static size_t
lexescape(wint_t endc,int regx,int cmd_line_operand)907 lexescape(wint_t endc, int regx, int cmd_line_operand)
908 {
909 	static char nlre[256];
910 	static char nlstr[256];
911 	static char eofre[256];
912 	static char eofstr[256];
913 	int first_time = 1;
914 	wint_t c;
915 	wchar_t *cp;
916 	int n, max;
917 
918 	if (first_time == 1) {
919 		(void) strcpy(nlre, gettext("Newline in regular expression\n"));
920 		(void) strcpy(nlstr, gettext("Newline in string\n"));
921 		(void) strcpy(eofre, gettext("EOF in regular expression\n"));
922 		(void) strcpy(eofstr, gettext("EOF in string\n"));
923 		first_time = 0;
924 	}
925 
926 	cp = linebuf;
927 	while ((c = lexgetc()) != endc) {
928 		if (c == '\n')
929 			awkerr(regx ? nlre : nlstr);
930 		if (c == '\\') {
931 			switch (c = lexgetc(), c) {
932 			case '\\':
933 				if (regx)
934 					*cp++ = '\\';
935 				break;
936 
937 			case '/':
938 				c = '/';
939 				break;
940 
941 			case 'n':
942 				c = '\n';
943 				break;
944 
945 			case 'b':
946 				c = '\b';
947 				break;
948 
949 			case 't':
950 				c = '\t';
951 				break;
952 
953 			case 'r':
954 				c = '\r';
955 				break;
956 
957 			case 'f':
958 				c = '\f';
959 				break;
960 
961 			case 'v':
962 				c = '\v';
963 				break;
964 
965 			case 'a':
966 				c = (char)0x07;
967 				break;
968 
969 			case 'x':
970 				n = 0;
971 				while (iswxdigit(c = lexgetc())) {
972 					if (iswdigit(c))
973 						c -= '0';
974 					else if (iswupper(c))
975 						c -= 'A'-10;
976 					else
977 						c -= 'a'-10;
978 					n = (n<<4) + c;
979 				}
980 				lexungetc(c);
981 				c = n;
982 				break;
983 
984 			case '0':
985 			case '1':
986 			case '2':
987 			case '3':
988 			case '4':
989 			case '5':
990 			case '6':
991 			case '7':
992 #if 0
993 /*
994  * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
995  * requires processing of the octal escapes both in strings and
996  * regular expressions. The following code is disabled instead of
997  * removed as back-referencing may be reintroduced in a future draft
998  * of the standard.
999  */
1000 				/*
1001 				 * For regular expressions, we disallow
1002 				 * \ooo to mean octal character, in favour
1003 				 * of back referencing.
1004 				 */
1005 				if (regx) {
1006 					*cp++ = '\\';
1007 					break;
1008 				}
1009 #endif
1010 				max = 3;
1011 				n = 0;
1012 				do {
1013 					n = (n<<3) + c-'0';
1014 					if ((c = lexgetc()) > '7' || c < '0')
1015 						break;
1016 				} while (--max);
1017 				lexungetc(c);
1018 				/*
1019 				 * an octal escape sequence must have at least
1020 				 * 2 digits after the backslash, otherwise
1021 				 * it gets passed straight thru for possible
1022 				 * use in backreferencing.
1023 				 */
1024 				if (max == 3) {
1025 					*cp++ = '\\';
1026 					n += '0';
1027 				}
1028 				c = n;
1029 				break;
1030 
1031 			case '\n':
1032 				continue;
1033 
1034 			default:
1035 				if (c != endc || cmd_line_operand) {
1036 					*cp++ = '\\';
1037 					if (c == endc)
1038 						lexungetc(c);
1039 				}
1040 			}
1041 		}
1042 		if (c == WEOF)
1043 			awkerr(regx ? eofre : eofstr);
1044 		*cp++ = c;
1045 	}
1046 	*cp = '\0';
1047 	return (cp - linebuf);
1048 }
1049 
1050 /*
1051  * Build a regular expression NODE.
1052  * Argument is the string holding the expression.
1053  */
1054 NODE *
renode(wchar_t * s)1055 renode(wchar_t *s)
1056 {
1057 	NODE *np;
1058 	int n;
1059 
1060 	np = emptynode(RE, 0);
1061 	np->n_left = np->n_right = NNULL;
1062 	if ((n = REGWCOMP(&np->n_regexp, s)) != REG_OK) {
1063 		int m;
1064 		char *p;
1065 
1066 		m = REGWERROR(n, np->n_regexp, NULL, 0);
1067 		p = (char *)emalloc(m);
1068 		REGWERROR(n, np->n_regexp, p, m);
1069 		awkerr("/%S/: %s", s, p);
1070 	}
1071 	return (np);
1072 }
1073 /*
1074  * Get a character for the lexical analyser routine.
1075  */
1076 static wint_t
lexgetc()1077 lexgetc()
1078 {
1079 	wint_t c;
1080 	static char **files = &progfiles[0];
1081 
1082 	if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
1083 		;
1084 	else {
1085 		if (progptr != NULL) {
1086 			if (proglen-- <= 0)
1087 				c = WEOF;
1088 			else
1089 				c = *progptr++;
1090 		} else {
1091 			if (progfp != FNULL) {
1092 				if (progfp != stdin)
1093 					(void) fclose(progfp);
1094 				else
1095 					clearerr(progfp);
1096 				progfp = FNULL;
1097 			}
1098 			if (files < progfilep) {
1099 				filename = *files++;
1100 				lineno = 1;
1101 				if (filename[0] == '-' && filename[1] == '\0')
1102 					progfp = stdin;
1103 				else if ((progfp = fopen(filename, r))
1104 				    == FNULL) {
1105 					(void) fprintf(stderr,
1106 				gettext("script file \"%s\""), filename);
1107 					exit(1);
1108 				}
1109 				c = fgetwc(progfp);
1110 			}
1111 		}
1112 	}
1113 	if (c == '\n')
1114 		++lineno;
1115 	if (conptr >= &context[NCONTEXT])
1116 		conptr = &context[0];
1117 	if (c != WEOF)
1118 		*conptr++ = c;
1119 	return (c);
1120 }
1121 
1122 /*
1123  * Return a character for lexical analyser.
1124  * Only one returned character is (not enforced) legitimite.
1125  */
1126 static void
lexungetc(wint_t c)1127 lexungetc(wint_t c)
1128 {
1129 	if (c == '\n')
1130 		--lineno;
1131 	if (c != WEOF) {
1132 		if (conptr == &context[0])
1133 			conptr = &context[NCONTEXT];
1134 		*--conptr = '\0';
1135 	}
1136 	if (progfp != FNULL) {
1137 		(void) ungetwc(c, progfp);
1138 		return;
1139 	}
1140 	if (c == WEOF)
1141 		return;
1142 	*--progptr = c;
1143 	proglen++;
1144 }
1145 
1146 /*
1147  * Syntax errors during parsing.
1148  */
1149 int
yyerror(const char * s,...)1150 yyerror(const char *s, ...)
1151 {
1152 	if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
1153 		if (lexlast == KEYWORD)
1154 			awkerr(gettext("inadmissible use of reserved keyword"));
1155 		else
1156 			awkerr(gettext("attempt to redefine builtin function"));
1157 	awkerr(s);
1158 	return (0);
1159 }
1160 
/*
 * Error routine for all awk errors.
 * Takes a printf-style format and arguments and passes them to
 * awkierr() with no errno text requested.
 */
void
awkerr(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(0, fmt, ap);
	va_end(ap);
}
1173 
/*
 * Error routine like "awkerr" except that the message is followed by
 * the strerror() text for the value errno had on entry to awkierr().
 */
void
awkperr(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(1, fmt, ap);
	va_end(ap);
}
1187 
1188 /*
1189  * Common internal routine for awkerr, awkperr
1190  */
1191 static void
awkierr(int perr,const char * fmt,va_list ap)1192 awkierr(int perr, const char *fmt, va_list ap)
1193 {
1194 	static char sep1[] = "\n>>>\t";
1195 	static char sep2[] = "\t<<<";
1196 	int saveerr = errno;
1197 
1198 	(void) fprintf(stderr, "%s: ", _cmdname);
1199 	if (running) {
1200 		(void) fprintf(stderr, gettext("line %u ("),
1201 		    curnode == NNULL ? 0 : curnode->n_lineno);
1202 		if (phase == 0)
1203 			(void) fprintf(stderr, "NR=%lld): ",
1204 			    (INT)exprint(varNR));
1205 		else
1206 			(void) fprintf(stderr, "%s): ",
1207 			    phase == BEGIN ? s_BEGIN : s_END);
1208 	} else if (lineno != 0) {
1209 		(void) fprintf(stderr, gettext("file \"%s\": "), filename);
1210 		(void) fprintf(stderr, gettext("line %u: "), lineno);
1211 	}
1212 	(void) vfprintf(stderr, gettext(fmt), ap);
1213 	if (perr == 1)
1214 		(void) fprintf(stderr, ": %s", strerror(saveerr));
1215 	if (perr != 2 && !running) {
1216 		wchar_t *cp;
1217 		int n;
1218 		int c;
1219 
1220 		(void) fprintf(stderr, gettext("  Context is:%s"), sep1);
1221 		cp = conptr;
1222 		n = NCONTEXT;
1223 		do {
1224 			if (cp >= &context[NCONTEXT])
1225 				cp = &context[0];
1226 			if ((c = *cp++) != '\0')
1227 				(void) fputs(c == '\n' ? sep1 : toprint(c),
1228 				    stderr);
1229 		} while (--n != 0);
1230 		(void) fputs(sep2, stderr);
1231 	}
1232 	(void) fprintf(stderr, "\n");
1233 	exit(1);
1234 }
1235 
1236 wchar_t *
emalloc(unsigned n)1237 emalloc(unsigned n)
1238 {
1239 	wchar_t *cp;
1240 
1241 	if ((cp = malloc(n)) == NULL)
1242 		awkerr(nomem);
1243 	return (cp);
1244 }
1245 
1246 wchar_t *
erealloc(wchar_t * p,unsigned n)1247 erealloc(wchar_t *p, unsigned n)
1248 {
1249 	wchar_t *cp;
1250 
1251 	if ((cp = realloc(p, n)) == NULL)
1252 		awkerr(nomem);
1253 	return (cp);
1254 }
1255 
1256 
/*
 * usage message for awk
 * Writes the synopsis to stderr and returns 2, which main() uses as
 * its return value.
 */
static int
usage()
{
	const char *synopsis = gettext(
"Usage:	awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
"	awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n");

	(void) fprintf(stderr, synopsis);
	return (2);
}
1268 
1269 
/*
 * Convert a multibyte string to a wide string held in a single static
 * slot: the result of the previous call is freed first, so only the
 * most recent result is valid and the caller must not free it.
 */
static wchar_t *
mbconvert(char *str)
{
	static wchar_t *last = 0;

	if (last != 0)
		free(last);
	last = mbstowcsdup(str);
	return (last);
}
1279 
/*
 * Convert a wide string to a multibyte string held in a single static
 * slot: the result of the previous call is freed first, so only the
 * most recent result is valid and the caller must not free it.
 */
char *
mbunconvert(wchar_t *str)
{
	static char *last = 0;

	if (last != 0)
		free(last);
	last = wcstombsdup(str);
	return (last);
}
1289 
1290 /*
1291  * Solaris port - following functions are typical MKS functions written
1292  * to work for Solaris.
1293  */
1294 
/*
 * Return a newly allocated wide-character copy of the multibyte
 * string `s'.  The caller owns the result and releases it with free().
 * Returns NULL when memory cannot be obtained or when `s' contains an
 * invalid multibyte sequence; in the latter case the buffer is released
 * and errno is left as mbstowcs() set it.
 */
wchar_t *
mbstowcsdup(char *s)
{
	size_t n;
	wchar_t *w;

	/* A string never has more wide characters than bytes. */
	n = strlen(s) + 1;
	if ((w = (wchar_t *)malloc(n * sizeof (wchar_t))) == NULL)
		return (NULL);

	if (mbstowcs(w, s, n) == (size_t)-1) {
		int saverr = errno;

		free(w);
		errno = saverr;
		return (NULL);
	}
	return (w);
}
1310 
/*
 * Return a newly allocated multibyte copy of the wide string `w'.
 * The caller owns the result and releases it with free().
 * Returns NULL when memory cannot be obtained or when `w' holds a
 * character that cannot be converted; in the latter case errno is left
 * as wcstombs() set it.
 */
char *
wcstombsdup(wchar_t *w)
{
	size_t cap;
	size_t len;
	char *mb;
	char *shrunk;

	/* Fetch memory for worst case string length */
	cap = (wcslen(w) + 1) * MB_CUR_MAX;
	if ((mb = (char *)malloc(cap)) == NULL) {
		return (NULL);
	}

	/* Convert the string */
	if ((len = wcstombs(mb, w, cap)) == (size_t)-1) {
		int saverr = errno;

		free(mb);
		errno = saverr;
		return (NULL);
	}

	/*
	 * Shrink the string down.  Should that fail, the original
	 * (larger) buffer is still valid and is returned as it is.
	 */
	if ((shrunk = (char *)realloc(mb, len + 1)) != NULL)
		mb = shrunk;
	return (mb);
}
1339 
/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  The first entry is for value 127, thus
 * the entry for character value c (0..31) is at index c + 1.
 */
static const char *const upe_ctrls[] =
{
	"^?",
	"^@",  "^A",  "^B",  "^C",  "^D",  "^E",  "^F",  "^G",
	"^H",  "^I",  "^J",  "^K",  "^L",  "^M",  "^N",  "^O",
	"^P",  "^Q",  "^R",  "^S",  "^T",  "^U",  "^V",  "^W",
	"^X",  "^Y",  "^Z",  "^[",  "^\\", "^]",  "^^",  "^_"
};


/*
 * Return a printable string corresponding to the given character value.  If
 * the character is printable, simply return it as the string.  If it is in
 * the range specified by table 5-101 in the UPE, return the corresponding
 * string.  Otherwise, return an octal escape sequence of its multibyte
 * form, or "\<hex>" when it has no multibyte form at all.
 *
 * The result lives in static storage (or in upe_ctrls[]) and is
 * overwritten by the next call.
 */
static const char *
toprint(wchar_t c)
{
	int n;
	size_t len;
	unsigned char *ptr;
	static char mbch[MB_LEN_MAX+1];
	static char buf[5 * MB_LEN_MAX + 1];

	if ((n = wctomb(mbch, c)) == -1) {
		/* Should never happen */
		(void) snprintf(buf, sizeof (buf), "\\%x", (unsigned int)c);
		return (buf);
	}
	mbch[n] = '\0';
	if (iswprint(c)) {
		return (mbch);
	} else if (c == 127) {
		return (upe_ctrls[0]);
	} else if (c >= 0 && c < 32) {
		/* Print as in Table 5-101 in the UPE */
		return (upe_ctrls[c+1]);
	}

	/* Print as an octal escape sequence, one "\ooo" per byte */
	buf[0] = '\0';
	for (len = 0, ptr = (unsigned char *)mbch; 0 < n; --n, ++ptr) {
		(void) snprintf(buf + len, sizeof (buf) - len,
		    "\\%03o", (unsigned int)*ptr);
		len += strlen(buf + len);
	}
	return (buf);
}
1389 
/*
 * Translate a byte offset into the multibyte form of `astring' into the
 * corresponding wide-character index.
 *
 * Walks `astring' accumulating the multibyte length of each character
 * until at least `off' bytes have been accounted for, and returns the
 * number of wide characters consumed.  A character that wctomb() cannot
 * convert is counted as one byte.  The walk never goes beyond the
 * terminating null, so an `off' larger than the string yields the
 * string's length.
 */
static int
wcoff(const wchar_t *astring, const int off)
{
	const wchar_t *s = astring;
	int c = 0;
	char mb[MB_LEN_MAX];

	/*
	 * wctomb() reports 1, not 0, for the null character, so the end of
	 * the string has to be tested for explicitly.
	 */
	while (c < off && *s != L'\0') {
		int n;

		if ((n = wctomb(mb, *s)) == 0)
			break;
		if (n == -1)
			n = 1;
		c += n;
		s++;
	}

	return ((int)(s - astring));
}
1409 
1410 #define	NREGHASH	64
1411 #define	NREGHOLD	1024	/* max number unused entries */
1412 
1413 static int	nregunref;
1414 
1415 struct reghashq {
1416 	struct qelem hq;
1417 	struct regcache *regcachep;
1418 };
1419 
1420 struct regcache {
1421 	struct qelem	lq;
1422 	wchar_t	*pattern;
1423 	regex_t	re;
1424 	int	refcnt;
1425 	struct reghashq	hash;
1426 };
1427 
1428 static struct qelem reghash[NREGHASH], reglink;
1429 
1430 /*
1431  * Generate a hash value of the given wchar string.
1432  * The hashing method is similar to what Java does for strings.
1433  */
1434 static uint_t
regtxthash(const wchar_t * str)1435 regtxthash(const wchar_t *str)
1436 {
1437 	int k = 0;
1438 
1439 	while (*str != L'\0')
1440 		k = (31 * k) + *str++;
1441 
1442 	k += ~(k << 9);
1443 	k ^=  (k >> 14);
1444 	k +=  (k << 4);
1445 	k ^=  (k >> 10);
1446 
1447 	return (k % NREGHASH);
1448 }
1449 
1450 int
int_regwcomp(REGEXP * r,const wchar_t * pattern)1451 int_regwcomp(REGEXP *r, const wchar_t *pattern)
1452 {
1453 	regex_t re;
1454 	char *mbpattern;
1455 	int ret;
1456 	uint_t key;
1457 	struct qelem *qp;
1458 	struct regcache *rcp;
1459 
1460 	key = regtxthash(pattern);
1461 	for (qp = reghash[key].q_forw; qp != NULL; qp = qp->q_forw) {
1462 		rcp = ((struct reghashq *)qp)->regcachep;
1463 		if (*rcp->pattern == *pattern &&
1464 		    wcscmp(rcp->pattern, pattern) == 0)
1465 			break;
1466 	}
1467 	if (qp != NULL) {
1468 		/* update link. put this one at the beginning */
1469 		if (rcp != (struct regcache *)reglink.q_forw) {
1470 			remque(&rcp->lq);
1471 			insque(&rcp->lq, &reglink);
1472 		}
1473 		if (rcp->refcnt == 0)
1474 			nregunref--;	/* no longer unref'ed */
1475 		rcp->refcnt++;
1476 		*(struct regcache **)r = rcp;
1477 		return (REG_OK);
1478 	}
1479 
1480 	if ((mbpattern = wcstombsdup((wchar_t *)pattern)) == NULL)
1481 		return (REG_ESPACE);
1482 
1483 	ret = regcomp(&re, mbpattern, REG_EXTENDED);
1484 
1485 	free(mbpattern);
1486 
1487 	if (ret != REG_OK)
1488 		return (ret);
1489 
1490 	if ((rcp = malloc(sizeof (struct regcache))) == NULL)
1491 		return (REG_ESPACE);
1492 	rcp->re = re;
1493 	if ((rcp->pattern = wsdup(pattern)) == NULL) {
1494 		regfree(&re);
1495 		free(rcp);
1496 		return (REG_ESPACE);
1497 	}
1498 	rcp->refcnt = 1;
1499 	insque(&rcp->lq, &reglink);
1500 	insque(&rcp->hash.hq, &reghash[key]);
1501 	rcp->hash.regcachep = rcp;
1502 
1503 	*(struct regcache **)r = rcp;
1504 	return (ret);
1505 }
1506 
1507 void
int_regwfree(REGEXP r)1508 int_regwfree(REGEXP r)
1509 {
1510 	int	cnt;
1511 	struct qelem *qp, *nqp;
1512 	struct regcache *rcp;
1513 
1514 	rcp = (struct regcache *)r;
1515 
1516 	if (--rcp->refcnt != 0)
1517 		return;
1518 
1519 	/* this cache has no reference */
1520 	if (++nregunref < NREGHOLD)
1521 		return;
1522 
1523 	/*
1524 	 * We've got too much unref'ed regex. Free half of least
1525 	 * used regex.
1526 	 */
1527 	cnt = 0;
1528 	for (qp = reglink.q_forw; qp != NULL; qp = nqp) {
1529 		nqp = qp->q_forw;
1530 		rcp = (struct regcache *)qp;
1531 		if (rcp->refcnt != 0)
1532 			continue;
1533 
1534 		/* free half of them */
1535 		if (++cnt < (NREGHOLD / 2))
1536 			continue;
1537 
1538 		/* detach and free */
1539 		remque(&rcp->lq);
1540 		remque(&rcp->hash.hq);
1541 
1542 		/* free up */
1543 		free(rcp->pattern);
1544 		regfree(&rcp->re);
1545 		free(rcp);
1546 
1547 		nregunref--;
1548 	}
1549 }
1550 
1551 size_t
int_regwerror(int errcode,REGEXP r,char * errbuf,size_t bufsiz)1552 int_regwerror(int errcode, REGEXP r, char *errbuf, size_t bufsiz)
1553 {
1554 	struct regcache *rcp;
1555 
1556 	rcp = (struct regcache *)r;
1557 	return (regerror(errcode, &rcp->re, errbuf, bufsiz));
1558 }
1559 
1560 int
int_regwexec(REGEXP r,const wchar_t * astring,size_t nsub,int_regwmatch_t * sub,int flags)1561 int_regwexec(REGEXP r,		/* compiled RE */
1562     const wchar_t *astring,	/* subject string */
1563     size_t nsub,		/* number of subexpressions */
1564     int_regwmatch_t *sub,	/* subexpression pointers */
1565     int flags)
1566 {
1567 	char *mbs;
1568 	regmatch_t *mbsub = NULL;
1569 	int i;
1570 	struct regcache *rcp;
1571 
1572 	if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL)
1573 		return (REG_ESPACE);
1574 
1575 	if (nsub > 0 && sub) {
1576 		if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
1577 			return (REG_ESPACE);
1578 	}
1579 
1580 	rcp = (struct regcache *)r;
1581 
1582 	i = regexec(&rcp->re, mbs, nsub, mbsub, flags);
1583 
1584 	/* Now, adjust the pointers/counts in sub */
1585 	if (i == REG_OK && nsub > 0 && mbsub) {
1586 		int j, k;
1587 
1588 		for (j = 0; j < nsub; j++) {
1589 			regmatch_t *ms = &mbsub[j];
1590 			int_regwmatch_t *ws = &sub[j];
1591 
1592 			if ((k = ms->rm_so) >= 0) {
1593 				ws->rm_so = wcoff(astring, k);
1594 				ws->rm_sp = astring + ws->rm_so;
1595 			}
1596 			if ((k = ms->rm_eo) >= 0) {
1597 				ws->rm_eo = wcoff(astring, k);
1598 				ws->rm_ep = astring + ws->rm_eo;
1599 			}
1600 		}
1601 	}
1602 
1603 	free(mbs);
1604 	if (mbsub)
1605 		free(mbsub);
1606 	return (i);
1607 }
1608 
1609 int
int_regwdosuba(REGEXP rp,const wchar_t * rpl,const wchar_t * src,wchar_t ** dstp,int len,int * globp)1610 int_regwdosuba(REGEXP rp,	/* compiled RE: Pattern */
1611     const wchar_t *rpl,		/* replacement string: /rpl/ */
1612     const wchar_t *src,		/* source string */
1613     wchar_t **dstp,		/* destination string */
1614     int len,			/* destination length */
1615     int *globp)		/* IN: occurence, 0 for all; OUT: substitutions */
1616 {
1617 	wchar_t *dst, *odst;
1618 	const wchar_t *ip, *xp;
1619 	wchar_t *op;
1620 	int i;
1621 	wchar_t c;
1622 	int glob, iglob = *globp, oglob = 0;
1623 #define	NSUB	10
1624 	int_regwmatch_t rm[NSUB], *rmp;
1625 	int flags;
1626 	wchar_t *end;
1627 	int regerr;
1628 
1629 /* handle overflow of dst. we need "i" more bytes */
1630 #ifdef OVERFLOW
1631 #undef OVERFLOW
1632 #define	OVERFLOW(i) { \
1633 		int pos = op - dst; \
1634 		dst = (wchar_t *)realloc(odst = dst, \
1635 			(len += len + i) * sizeof (wchar_t)); \
1636 		if (dst == NULL) \
1637 			goto nospace; \
1638 		op = dst + pos; \
1639 		end = dst + len; \
1640 	}
1641 #endif
1642 
1643 	*dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t));
1644 	if (dst == NULL)
1645 		return (REG_ESPACE);
1646 
1647 	if (rp == NULL || rpl == NULL || src == NULL || dst ==  NULL)
1648 		return (REG_EFATAL);
1649 
1650 	glob = 0;	/* match count */
1651 	ip = src;	/* source position */
1652 	op = dst;	/* destination position */
1653 	end = dst + len;
1654 
1655 	flags = 0;
1656 	while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
1657 		/* Copy text preceding match */
1658 		if (op + (i = rm[0].rm_sp - ip) >= end)
1659 			OVERFLOW(i)
1660 		while (i--)
1661 			*op++ = *ip++;
1662 
1663 		if (iglob == 0 || ++glob == iglob) {
1664 			oglob++;
1665 			xp = rpl;		/* do substitute */
1666 		} else
1667 			xp = L"&";		/* preserve text */
1668 
1669 		/* Perform replacement of matched substing */
1670 		while ((c = *xp++) != '\0') {
1671 			rmp = NULL;
1672 			if (c == '&')
1673 				rmp = &rm[0];
1674 			else if (c == '\\') {
1675 				if ('0' <= *xp && *xp <= '9')
1676 					rmp = &rm[*xp++ - '0'];
1677 				else if (*xp != '\0')
1678 					c = *xp++;
1679 			}
1680 
1681 			if (rmp ==  NULL) {	/* Ordinary character. */
1682 				*op++ = c;
1683 				if (op >= end)
1684 					OVERFLOW(1)
1685 			} else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
1686 				ip = rmp->rm_sp;
1687 				if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
1688 					OVERFLOW(i)
1689 				while (i--)
1690 					*op++ = *ip++;
1691 			}
1692 		}
1693 
1694 		ip = rm[0].rm_ep;
1695 		if (*ip == '\0')	/* If at end break */
1696 			break;
1697 		else if (rm[0].rm_sp == rm[0].rm_ep) {
1698 			/* If empty match copy next char */
1699 			*op++ = *ip++;
1700 			if (op >= end)
1701 				OVERFLOW(1)
1702 		}
1703 		flags = REG_NOTBOL;
1704 	}
1705 
1706 	if (regerr != REG_OK && regerr != REG_NOMATCH)
1707 		return (regerr);
1708 
1709 	/* Copy rest of text */
1710 	if (op + (i =  wcslen(ip)) >= end)
1711 		OVERFLOW(i)
1712 	while (i--)
1713 		*op++ = *ip++;
1714 	*op++ = '\0';
1715 
1716 	if ((*dstp = dst = (wchar_t *)realloc(odst = dst,
1717 	    sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
1718 nospace:
1719 		free(odst);
1720 		return (REG_ESPACE);
1721 	}
1722 
1723 	*globp = oglob;
1724 
1725 	return ((oglob == 0) ? REG_NOMATCH : REG_OK);
1726 }
1727