xref: /titanic_41/usr/src/cmd/awk_xpg4/awk1.c (revision 79777a7dd0179283917bda2ba98999c382d31c2c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
28  */
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 /*
33  * awk -- mainline, yylex, etc.
34  *
35  * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
36  */
37 
38 #include "awk.h"
39 #include "y.tab.h"
40 #include <stdarg.h>
41 #include <unistd.h>
42 #include <locale.h>
43 #include <search.h>
44 
45 static char	*progfiles[NPFILE];	/* Names of programme files given with -f */
46 static char	**progfilep = &progfiles[0]; /* Next free slot in progfiles[] */
47 static wchar_t	*progptr;		/* Read position in the in-memory programme */
48 static int	proglen;		/* Characters left to read at progptr */
49 static wchar_t	context[NCONTEXT];	/* Circular buffer of recently read input */
50 static wchar_t	*conptr = &context[0];	/* Next write position in context[] */
51 static FILE	*progfp;		/* Stdio stream for the current programme file */
52 static char	*filename;		/* Programme source name used in diagnostics */
53 #ifdef	DEBUG
54 static int	dflag;			/* -d given: yylex() prints each token */
55 #endif
56 
57 #define	AWK_EXEC_MAGIC	"<MKS AWKC>"
58 #define	LEN_EXEC_MAGIC	10
59 
60 static char	unbal[] = "unbalanced E char";
61 
62 static void	awkarginit(int c, char **av);
63 static int	lexid(wint_t c);
64 static int	lexnumber(wint_t c);
65 static int	lexstring(wint_t endc);
66 static int	lexregexp(wint_t endc);
67 
68 static void	awkvarinit(void);
69 static wint_t	lexgetc(void);
70 static void	lexungetc(wint_t c);
71 static size_t	lexescape(wint_t endc, int regx, int cmd_line_operand);
72 static void	awkierr(int perr, char *fmt, va_list ap);
73 static int	usage(void);
74 void		strescape(wchar_t *str);
75 static const char	*toprint(wint_t);
76 char *_cmdname;
77 static wchar_t *mbconvert(char *str);
78 
79 extern int	isclvar(wchar_t *arg);
80 
81 /*
82  * mainline for awk
83  */
84 int
main(int argc,char * argv[])85 main(int argc, char *argv[])
86 {
87 	wchar_t *ap;
88 	char *cmd;
89 
90 	cmd = argv[0];
91 	_cmdname = cmd;
92 
93 	linebuf = emalloc(NLINE * sizeof (wchar_t));
94 
95 	/*
96 	 * At this point only messaging should be internationalized.
97 	 * numbers are still scanned as in the Posix locale.
98 	 */
99 	(void) setlocale(LC_ALL, "");
100 	(void) setlocale(LC_NUMERIC, "C");
101 #if !defined(TEXT_DOMAIN)
102 #define	TEXT_DOMAIN	"SYS_TEST"
103 #endif
104 	(void) textdomain(TEXT_DOMAIN);
105 
106 	awkvarinit();
107 	/* running = 1; */
108 	while (argc > 1 && *argv[1] == '-') {
109 		void *save_ptr = NULL;
110 		ap = mbstowcsdup(&argv[1][1]);
111 		if (ap == NULL)
112 			break;
113 		if (*ap == '\0') {
114 			free(ap);
115 			break;
116 		}
117 		save_ptr = (void *) ap;
118 		++argv;
119 		--argc;
120 		if (*ap == '-' && ap[1] == '\0')
121 			break;
122 		for (; *ap != '\0'; ++ap) {
123 			switch (*ap) {
124 #ifdef DEBUG
125 			case 'd':
126 				dflag = 1;
127 				continue;
128 
129 #endif
130 			case 'f':
131 				if (argc < 2) {
132 					(void) fprintf(stderr,
133 				gettext("Missing script file\n"));
134 					return (1);
135 				}
136 				*progfilep++ = argv[1];
137 				--argc;
138 				++argv;
139 				continue;
140 
141 			case 'F':
142 				if (ap[1] == '\0') {
143 					if (argc < 2) {
144 						(void) fprintf(stderr,
145 				gettext("Missing field separator\n"));
146 						return (1);
147 					}
148 					ap = mbstowcsdup(argv[1]);
149 					--argc;
150 					++argv;
151 				} else
152 					++ap;
153 				strescape(ap);
154 				strassign(varFS, linebuf, FALLOC,
155 				    wcslen(linebuf));
156 				break;
157 
158 			case 'v': {
159 				wchar_t *vp;
160 				wchar_t *arg;
161 
162 				if (argc < 2) {
163 					(void) fprintf(stderr,
164 		gettext("Missing variable assignment\n"));
165 					return (1);
166 				}
167 				arg = mbconvert(argv[1]);
168 				/*
169 				 * Ensure the variable expression
170 				 * is valid (correct form).
171 				 */
172 				if (((vp = wcschr(arg, '=')) != NULL) &&
173 				    isclvar(arg)) {
174 					*vp = '\0';
175 					strescape(vp+1);
176 					strassign(vlook(arg), linebuf,
177 					    FALLOC|FSENSE,
178 					    wcslen(linebuf));
179 					*vp = '=';
180 				} else {
181 					(void) fprintf(stderr, gettext(
182 					    "Invalid form for variable "
183 					    "assignment: %S\n"), arg);
184 					return (1);
185 				}
186 				--argc;
187 				++argv;
188 				continue;
189 			}
190 
191 			default:
192 				(void) fprintf(stderr,
193 				gettext("Unknown option \"-%S\"\n"), ap);
194 				return (usage());
195 			}
196 			break;
197 		}
198 		if (save_ptr)
199 			free(save_ptr);
200 	}
201 	if (progfilep == &progfiles[0]) {
202 		if (argc < 2)
203 			return (usage());
204 		filename = "[command line]";	/* BUG: NEEDS TRANSLATION */
205 		progptr = mbstowcsdup(argv[1]);
206 		proglen = wcslen(progptr);
207 		--argc;
208 		++argv;
209 	}
210 
211 	argv[0] = cmd;
212 
213 	awkarginit(argc, argv);
214 
215 	/* running = 0; */
216 	(void) yyparse();
217 
218 	lineno = 0;
219 	/*
220 	 * Ok, done parsing, so now activate the rest of the nls stuff, set
221 	 * the radix character.
222 	 */
223 	(void) setlocale(LC_ALL, "");
224 	radixpoint = *localeconv()->decimal_point;
225 	awk();
226 	/* NOTREACHED */
227 	return (0);
228 }
229 
230 /*
231  * Do initial setup of buffers, etc.
232  * This must be called before most processing
233  * and especially before lexical analysis.
234  * Variables initialised here will be overruled by command
235  * line parameter initialisation.
236  */
237 static void
awkvarinit()238 awkvarinit()
239 {
240 	NODE *np;
241 
242 	(void) setvbuf(stderr, NULL, _IONBF, 0);
243 
244 	if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
245 		(void) fprintf(stderr,
246 	gettext("not enough available file descriptors"));
247 		exit(1);
248 	}
249 	ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
250 #ifdef A_ZERO_POINTERS
251 	(void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
252 #else
253 	{
254 		/* initialize file descriptor table */
255 		OFILE *fp;
256 		for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
257 			fp->f_fp = FNULL;
258 					fp->f_mode = 0;
259 					fp->f_name = (char *)0;
260 		}
261 	}
262 #endif
263 	constant = intnode((INT)0);
264 
265 	const0 = intnode((INT)0);
266 	const1 = intnode((INT)1);
267 	constundef = emptynode(CONSTANT, 0);
268 	constundef->n_flags = FSTRING|FVINT;
269 	constundef->n_string = _null;
270 	constundef->n_strlen = 0;
271 	inc_oper = emptynode(ADD, 0);
272 	inc_oper->n_right = const1;
273 	asn_oper = emptynode(ADD, 0);
274 	field0 = node(FIELD, const0, NNULL);
275 
276 	{
277 		RESFUNC near*rp;
278 
279 		for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
280 			np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
281 		}
282 	}
283 	{
284 		RESERVED near*rp;
285 
286 		for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
287 			switch (rp->r_type) {
288 			case SVAR:
289 			case VAR:
290 				running = 1;
291 				np = vlook(rp->r_name);
292 				if (rp->r_type == SVAR)
293 					np->n_flags |= FSPECIAL;
294 				if (rp->r_svalue != NULL)
295 					strassign(np, rp->r_svalue, FSTATIC,
296 					    (size_t)rp->r_ivalue);
297 				else {
298 					constant->n_int = rp->r_ivalue;
299 					(void) assign(np, constant);
300 				}
301 				running = 0;
302 				break;
303 
304 			case KEYWORD:
305 				kinstall(rp->r_name, (int)rp->r_ivalue);
306 				break;
307 			}
308 		}
309 	}
310 
311 	varNR = vlook(s_NR);
312 	varFNR = vlook(s_FNR);
313 	varNF = vlook(s_NF);
314 	varOFMT = vlook(s_OFMT);
315 	varCONVFMT = vlook(s_CONVFMT);
316 	varOFS = vlook(s_OFS);
317 	varORS = vlook(s_ORS);
318 	varRS = vlook(s_RS);
319 	varFS = vlook(s_FS);
320 	varARGC = vlook(s_ARGC);
321 	varSUBSEP = vlook(s_SUBSEP);
322 	varENVIRON = vlook(s_ENVIRON);
323 	varFILENAME = vlook(s_FILENAME);
324 	varSYMTAB = vlook(s_SYMTAB);
325 	incNR = node(ASG, varNR, node(ADD, varNR, const1));
326 	incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
327 	clrFNR = node(ASG, varFNR, const0);
328 }
329 
330 /*
331  * Initialise awk ARGC, ARGV variables.
332  */
333 static void
awkarginit(int ac,char ** av)334 awkarginit(int ac, char **av)
335 {
336 	int i;
337 	wchar_t *cp;
338 
339 	ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
340 	running = 1;
341 	constant->n_int = ac;
342 	(void) assign(varARGC, constant);
343 	for (i = 0; i < ac; ++i) {
344 		cp = mbstowcsdup(av[i]);
345 		constant->n_int = i;
346 		strassign(exprreduce(ARGVsubi), cp,
347 		    FSTATIC|FSENSE, wcslen(cp));
348 	}
349 	running = 0;
350 }
351 
352 /*
353  * Clean up when done parsing a function.
354  * All formal parameters, because of a deal (funparm) in
355  * yylex, get put into the symbol table in front of any
356  * global variable of the same name.  When the entire
357  * function is parsed, remove these formal dummy nodes
358  * from the symbol table but retain the nodes because
359  * the generated tree points at them.
360  */
361 void
uexit(NODE * np)362 uexit(NODE *np)
363 {
364 	NODE *formal;
365 
366 	while ((formal = getlist(&np)) != NNULL)
367 		delsymtab(formal, 0);
368 }
369 
370 /*
371  * The lexical analyzer.
372  */
373 int
yylex()374 yylex()
375 #ifdef	DEBUG
376 {
377 	int l;
378 
379 	l = yyhex();
380 	if (dflag)
381 		(void) printf("%d\n", l);
382 	return (l);
383 }
yyhex()384 yyhex()
385 #endif
386 {
387 	wint_t c, c1;
388 	int i;
389 	static int savetoken = 0;
390 	static int wasfield;
391 	static int isfuncdef;
392 	static int nbrace, nparen, nbracket;
393 	static struct ctosymstruct {
394 		wint_t c, sym;
395 	} ctosym[] = {
396 		{ '|', BAR },		{ '^', CARAT },
397 		{ '~', TILDE },		{ '<', LANGLE },
398 		{ '>', RANGLE },	{ '+', PLUSC },
399 		{ '-', HYPHEN },	{ '*', STAR },
400 		{ '/', SLASH },		{ '%', PERCENT },
401 		{ '!', EXCLAMATION },	{ '$', DOLLAR },
402 		{ '[', LSQUARE },	{ ']', RSQUARE },
403 		{ '(', LPAREN },	{ ')', RPAREN },
404 		{ ';', SEMI },		{ '{', LBRACE },
405 		{ '}', RBRACE },	{   0, 0 }
406 	};
407 
408 	if (savetoken) {
409 		c = savetoken;
410 		savetoken = 0;
411 	} else if (redelim != '\0') {
412 		c = redelim;
413 		redelim = 0;
414 		catterm = 0;
415 		savetoken = c;
416 		return (lexlast = lexregexp(c));
417 	} else while ((c = lexgetc()) != WEOF) {
418 		if (iswalpha(c) || c == '_') {
419 			c = lexid(c);
420 		} else if (iswdigit(c) || c == '.') {
421 			c = lexnumber(c);
422 		} else if (isWblank(c)) {
423 			continue;
424 		} else switch (c) {
425 #if DOS || OS2
426 		case 032:		/* ^Z */
427 			continue;
428 #endif
429 
430 		case '"':
431 			c = lexstring(c);
432 			break;
433 
434 		case '#':
435 			while ((c = lexgetc()) != '\n' && c != WEOF)
436 				;
437 			lexungetc(c);
438 			continue;
439 
440 		case '+':
441 			if ((c1 = lexgetc()) == '+')
442 				c = INC;
443 			else if (c1 == '=')
444 				c = AADD;
445 			else
446 				lexungetc(c1);
447 			break;
448 
449 		case '-':
450 			if ((c1 = lexgetc()) == '-')
451 				c = DEC;
452 			else if (c1 == '=')
453 				c = ASUB;
454 			else
455 				lexungetc(c1);
456 			break;
457 
458 		case '*':
459 			if ((c1 = lexgetc()) == '=')
460 				c = AMUL;
461 			else if (c1 == '*') {
462 				if ((c1 = lexgetc()) == '=')
463 					c = AEXP;
464 				else {
465 					c = EXP;
466 					lexungetc(c1);
467 				}
468 			} else
469 				lexungetc(c1);
470 			break;
471 
472 		case '^':
473 			if ((c1 = lexgetc()) == '=') {
474 				c = AEXP;
475 			} else {
476 				c = EXP;
477 				lexungetc(c1);
478 			}
479 			break;
480 
481 		case '/':
482 			if ((c1 = lexgetc()) == '=' &&
483 			    lexlast != RE && lexlast != NRE &&
484 			    lexlast != ';' && lexlast != '\n' &&
485 			    lexlast != ',' && lexlast != '(')
486 				c = ADIV;
487 			else
488 				lexungetc(c1);
489 			break;
490 
491 		case '%':
492 			if ((c1 = lexgetc()) == '=')
493 				c = AREM;
494 			else
495 				lexungetc(c1);
496 			break;
497 
498 		case '&':
499 			if ((c1 = lexgetc()) == '&')
500 				c = AND;
501 			else
502 				lexungetc(c1);
503 			break;
504 
505 		case '|':
506 			if ((c1 = lexgetc()) == '|')
507 				c = OR;
508 			else {
509 				lexungetc(c1);
510 				if (inprint)
511 					c = PIPE;
512 			}
513 			break;
514 
515 		case '>':
516 			if ((c1 = lexgetc()) == '=')
517 				c = GE;
518 			else if (c1 == '>')
519 				c = APPEND;
520 			else {
521 				lexungetc(c1);
522 				if (nparen == 0 && inprint)
523 					c = WRITE;
524 			}
525 			break;
526 
527 		case '<':
528 			if ((c1 = lexgetc()) == '=')
529 				c = LE;
530 			else
531 				lexungetc(c1);
532 			break;
533 
534 		case '!':
535 			if ((c1 = lexgetc()) == '=')
536 				c = NE;
537 			else if (c1 == '~')
538 				c = NRE;
539 			else
540 				lexungetc(c1);
541 			break;
542 
543 		case '=':
544 			if ((c1 = lexgetc()) == '=')
545 				c = EQ;
546 			else {
547 				lexungetc(c1);
548 				c = ASG;
549 			}
550 			break;
551 
552 		case '\n':
553 			switch (lexlast) {
554 			case ')':
555 				if (catterm || inprint) {
556 					c = ';';
557 					break;
558 				}
559 			/*FALLTHRU*/
560 			case AND:
561 			case OR:
562 			case COMMA:
563 			case '{':
564 			case ELSE:
565 			case ';':
566 			case DO:
567 				continue;
568 
569 			case '}':
570 				if (nbrace != 0)
571 					continue;
572 
573 			default:
574 				c = ';';
575 				break;
576 			}
577 			break;
578 
579 		case ELSE:
580 			if (lexlast != ';') {
581 				savetoken = ELSE;
582 				c = ';';
583 			}
584 			break;
585 
586 		case '(':
587 			++nparen;
588 			break;
589 
590 		case ')':
591 			if (--nparen < 0)
592 				awkerr(unbal, "()");
593 			break;
594 
595 		case '{':
596 			nbrace++;
597 			break;
598 
599 		case '}':
600 			if (--nbrace < 0) {
601 				char brk[3];
602 
603 				brk[0] = '{';
604 				brk[1] = '}';
605 				brk[2] = '\0';
606 				awkerr(unbal, brk);
607 			}
608 			if (lexlast != ';') {
609 				savetoken = c;
610 				c = ';';
611 			}
612 			break;
613 
614 		case '[':
615 			++nbracket;
616 			break;
617 
618 		case ']':
619 			if (--nbracket < 0) {
620 				char brk[3];
621 
622 				brk[0] = '[';
623 				brk[1] = ']';
624 				brk[2] = '\0';
625 				awkerr(unbal, brk);
626 			}
627 			break;
628 
629 		case '\\':
630 			if ((c1 = lexgetc()) == '\n')
631 				continue;
632 			lexungetc(c1);
633 			break;
634 
635 		case ',':
636 			c = COMMA;
637 			break;
638 
639 		case '?':
640 			c = QUEST;
641 			break;
642 
643 		case ':':
644 			c = COLON;
645 			break;
646 
647 		default:
648 			if (!iswprint(c))
649 				awkerr(
650 				    gettext("invalid character \"%s\""),
651 				    toprint(c));
652 			break;
653 		}
654 		break;
655 	}
656 
657 	switch (c) {
658 	case ']':
659 		++catterm;
660 		break;
661 
662 	case VAR:
663 		if (catterm) {
664 			savetoken = c;
665 			c = CONCAT;
666 			catterm = 0;
667 		} else if (!isfuncdef) {
668 			if ((c1 = lexgetc()) != '(')
669 				++catterm;
670 			lexungetc(c1);
671 		}
672 		isfuncdef = 0;
673 		break;
674 
675 	case PARM:
676 	case CONSTANT:
677 		if (catterm) {
678 			savetoken = c;
679 			c = CONCAT;
680 			catterm = 0;
681 		} else {
682 			if (lexlast == '$')
683 				wasfield = 2;
684 			++catterm;
685 		}
686 		break;
687 
688 	case INC:
689 	case DEC:
690 		if (!catterm || lexlast != CONSTANT || wasfield)
691 			break;
692 
693 	/*FALLTHRU*/
694 	case UFUNC:
695 	case FUNC:
696 	case GETLINE:
697 	case '!':
698 	case '$':
699 	case '(':
700 		if (catterm) {
701 			savetoken = c;
702 			c = CONCAT;
703 			catterm = 0;
704 		}
705 		break;
706 
707 	/* { */ case '}':
708 		if (nbrace == 0)
709 			savetoken = ';';
710 	/*FALLTHRU*/
711 	case ';':
712 		inprint = 0;
713 	/*FALLTHRU*/
714 	default:
715 		if (c == DEFFUNC)
716 			isfuncdef = 1;
717 		catterm = 0;
718 	}
719 	lexlast = c;
720 	if (wasfield)
721 		wasfield--;
722 	/*
723 	 * Map character constants to symbolic names.
724 	 */
725 	for (i = 0; ctosym[i].c != 0; i++)
726 		if (c == ctosym[i].c) {
727 			c = ctosym[i].sym;
728 			break;
729 		}
730 	return ((int)c);
731 }
732 
733 /*
734  * Read a number for the lexical analyzer.
735  * Input is the first character of the number.
736  * Return value is the lexical type.
737  */
738 static int
lexnumber(wint_t c)739 lexnumber(wint_t c)
740 {
741 	wchar_t *cp;
742 	int dotfound = 0;
743 	int efound = 0;
744 	INT number;
745 
746 	cp = linebuf;
747 	do {
748 		if (iswdigit(c))
749 			;
750 		else if (c == '.') {
751 			if (dotfound++)
752 				break;
753 		} else if (c == 'e' || c == 'E') {
754 			if ((c = lexgetc()) != '-' && c != '+') {
755 				lexungetc(c);
756 				c = 'e';
757 			} else
758 				*cp++ = 'e';
759 			if (efound++)
760 				break;
761 		} else
762 			break;
763 		*cp++ = c;
764 	} while ((c = lexgetc()) != WEOF);
765 	*cp = '\0';
766 	if (dotfound && cp == linebuf+1)
767 		return (DOT);
768 	lexungetc(c);
769 	errno = 0;
770 	if (!dotfound && !efound &&
771 	    ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
772 		yylval.node = intnode(number);
773 	else
774 		yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
775 	return (CONSTANT);
776 }
777 
778 /*
779  * Read an identifier.
780  * Input is first character of identifier.
781  * Return VAR.
782  */
783 static int
lexid(wint_t c)784 lexid(wint_t c)
785 {
786 	wchar_t *cp;
787 	size_t i;
788 	NODE *np;
789 
790 	cp = linebuf;
791 	do {
792 		*cp++ = c;
793 		c = lexgetc();
794 	} while (iswalpha(c) || iswdigit(c) || c == '_');
795 	*cp = '\0';
796 	lexungetc(c);
797 	yylval.node = np = vlook(linebuf);
798 
799 	switch (np->n_type) {
800 	case KEYWORD:
801 		switch (np->n_keywtype) {
802 		case PRINT:
803 		case PRINTF:
804 			++inprint;
805 		default:
806 			return ((int)np->n_keywtype);
807 		}
808 		/* NOTREACHED */
809 
810 	case ARRAY:
811 	case VAR:
812 		/*
813 		 * If reading the argument list, create a dummy node
814 		 * for the duration of that function. These variables
815 		 * can be removed from the symbol table at function end
816 		 * but they must still exist because the execution tree
817 		 * knows about them.
818 		 */
819 		if (funparm) {
820 do_funparm:
821 			np = emptynode(PARM, i = (cp-linebuf));
822 			np->n_flags = FSTRING;
823 			np->n_string = _null;
824 			np->n_strlen = 0;
825 			(void) memcpy(np->n_name, linebuf,
826 			    (i+1) * sizeof (wchar_t));
827 			addsymtab(np);
828 			yylval.node = np;
829 		} else if (np == varNF || (np == varFS &&
830 		    (!doing_begin || begin_getline))) {
831 			/*
832 			 * If the user program references NF or sets
833 			 * FS either outside of a begin block or
834 			 * in a begin block after a getline then the
835 			 * input line will be split immediately upon read
836 			 * rather than when a field is first referenced.
837 			 */
838 			needsplit = 1;
839 		} else if (np == varENVIRON)
840 			needenviron = 1;
841 	/*FALLTHRU*/
842 	case PARM:
843 		return (VAR);
844 
845 	case UFUNC:
846 		/*
847 		 * It is ok to redefine functions as parameters
848 		 */
849 		if (funparm) goto do_funparm;
850 	/*FALLTHRU*/
851 	case FUNC:
852 	case GETLINE:
853 		/*
854 		 * When a getline is encountered, clear the 'doing_begin' flag.
855 		 * This will force the 'needsplit' flag to be set, even inside
856 		 * a begin block, if FS is altered. (See VAR case above)
857 		 */
858 		if (doing_begin)
859 			begin_getline = 1;
860 		return (np->n_type);
861 	}
862 	/* NOTREACHED */
863 	return (0);
864 }
865 
866 /*
867  * Read a string for the lexical analyzer.
868  * `endc' terminates the string.
869  */
870 static int
lexstring(wint_t endc)871 lexstring(wint_t endc)
872 {
873 	size_t length = lexescape(endc, 0, 0);
874 
875 	yylval.node = stringnode(linebuf, FALLOC, length);
876 	return (CONSTANT);
877 }
878 
879 /*
880  * Read a regular expression.
881  */
882 static int
lexregexp(wint_t endc)883 lexregexp(wint_t endc)
884 {
885 	(void) lexescape(endc, 1, 0);
886 	yylval.node = renode(linebuf);
887 	return (URE);
888 }
889 
890 /*
891  * Process a string, converting the escape characters as required by
892  * 1003.2. The processed string ends up in the global linebuf[]. This
893  * routine also changes the value of 'progfd' - the program file
894  * descriptor, so it should be used with some care. It is presently used to
895  * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
896  */
897 void
strescape(wchar_t * str)898 strescape(wchar_t *str)
899 {
900 	progptr = str;
901 	proglen = wcslen(str) + 1;	/* Include \0 */
902 	(void) lexescape('\0', 0, 1);
903 	progptr = NULL;
904 }
905 
906 /*
907  * Read a string or regular expression, terminated by ``endc'',
908  * for lexical analyzer, processing escape sequences.
909  * Return string length.
910  */
911 static size_t
lexescape(wint_t endc,int regx,int cmd_line_operand)912 lexescape(wint_t endc, int regx, int cmd_line_operand)
913 {
914 	static char nlre[256];
915 	static char nlstr[256];
916 	static char eofre[256];
917 	static char eofstr[256];
918 	int first_time = 1;
919 	wint_t c;
920 	wchar_t *cp;
921 	int n, max;
922 
923 	if (first_time == 1) {
924 		(void) strcpy(nlre, gettext("Newline in regular expression\n"));
925 		(void) strcpy(nlstr, gettext("Newline in string\n"));
926 		(void) strcpy(eofre, gettext("EOF in regular expression\n"));
927 		(void) strcpy(eofstr, gettext("EOF in string\n"));
928 		first_time = 0;
929 	}
930 
931 	cp = linebuf;
932 	while ((c = lexgetc()) != endc) {
933 		if (c == '\n')
934 			awkerr(regx ? nlre : nlstr);
935 		if (c == '\\') {
936 			switch (c = lexgetc(), c) {
937 			case '\\':
938 				if (regx)
939 					*cp++ = '\\';
940 				break;
941 
942 			case '/':
943 				c = '/';
944 				break;
945 
946 			case 'n':
947 				c = '\n';
948 				break;
949 
950 			case 'b':
951 				c = '\b';
952 				break;
953 
954 			case 't':
955 				c = '\t';
956 				break;
957 
958 			case 'r':
959 				c = '\r';
960 				break;
961 
962 			case 'f':
963 				c = '\f';
964 				break;
965 
966 			case 'v':
967 				c = '\v';
968 				break;
969 
970 			case 'a':
971 				c = (char)0x07;
972 				break;
973 
974 			case 'x':
975 				n = 0;
976 				while (iswxdigit(c = lexgetc())) {
977 					if (iswdigit(c))
978 						c -= '0';
979 					else if (iswupper(c))
980 						c -= 'A'-10;
981 					else
982 						c -= 'a'-10;
983 					n = (n<<4) + c;
984 				}
985 				lexungetc(c);
986 				c = n;
987 				break;
988 
989 			case '0':
990 			case '1':
991 			case '2':
992 			case '3':
993 			case '4':
994 			case '5':
995 			case '6':
996 			case '7':
997 #if 0
998 /*
999  * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
1000  * requires processing of the octal escapes both in strings and
1001  * regular expressions. The following code is disabled instead of
1002  * removed as back-referencing may be reintroduced in a future draft
1003  * of the standard.
1004  */
1005 				/*
1006 				 * For regular expressions, we disallow
1007 				 * \ooo to mean octal character, in favour
1008 				 * of back referencing.
1009 				 */
1010 				if (regx) {
1011 					*cp++ = '\\';
1012 					break;
1013 				}
1014 #endif
1015 				max = 3;
1016 				n = 0;
1017 				do {
1018 					n = (n<<3) + c-'0';
1019 					if ((c = lexgetc()) > '7' || c < '0')
1020 						break;
1021 				} while (--max);
1022 				lexungetc(c);
1023 				/*
1024 				 * an octal escape sequence must have at least
1025 				 * 2 digits after the backslash, otherwise
1026 				 * it gets passed straight thru for possible
1027 				 * use in backreferencing.
1028 				 */
1029 				if (max == 3) {
1030 					*cp++ = '\\';
1031 					n += '0';
1032 				}
1033 				c = n;
1034 				break;
1035 
1036 			case '\n':
1037 				continue;
1038 
1039 			default:
1040 				if (c != endc || cmd_line_operand) {
1041 					*cp++ = '\\';
1042 					if (c == endc)
1043 						lexungetc(c);
1044 				}
1045 			}
1046 		}
1047 		if (c == WEOF)
1048 			awkerr(regx ? eofre : eofstr);
1049 		*cp++ = c;
1050 	}
1051 	*cp = '\0';
1052 	return (cp - linebuf);
1053 }
1054 
1055 /*
1056  * Build a regular expression NODE.
1057  * Argument is the string holding the expression.
1058  */
1059 NODE *
renode(wchar_t * s)1060 renode(wchar_t *s)
1061 {
1062 	NODE *np;
1063 	int n;
1064 
1065 	np = emptynode(RE, 0);
1066 	np->n_left = np->n_right = NNULL;
1067 	if ((n = REGWCOMP(&np->n_regexp, s)) != REG_OK) {
1068 		int m;
1069 		char *p;
1070 
1071 		m = REGWERROR(n, np->n_regexp, NULL, 0);
1072 		p = (char *)emalloc(m);
1073 		REGWERROR(n, np->n_regexp, p, m);
1074 		awkerr("/%S/: %s", s, p);
1075 	}
1076 	return (np);
1077 }
1078 /*
1079  * Get a character for the lexical analyser routine.
1080  */
1081 static wint_t
lexgetc()1082 lexgetc()
1083 {
1084 	wint_t c;
1085 	static char **files = &progfiles[0];
1086 
1087 	if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
1088 		;
1089 	else {
1090 		if (progptr != NULL) {
1091 			if (proglen-- <= 0)
1092 				c = WEOF;
1093 			else
1094 				c = *progptr++;
1095 		} else {
1096 			if (progfp != FNULL)
1097 				if (progfp != stdin)
1098 					(void) fclose(progfp);
1099 				else
1100 					clearerr(progfp);
1101 				progfp = FNULL;
1102 			if (files < progfilep) {
1103 				filename = *files++;
1104 				lineno = 1;
1105 				if (filename[0] == '-' && filename[1] == '\0')
1106 					progfp = stdin;
1107 				else if ((progfp = fopen(filename, r))
1108 				    == FNULL) {
1109 					(void) fprintf(stderr,
1110 				gettext("script file \"%s\""), filename);
1111 					exit(1);
1112 				}
1113 				c = fgetwc(progfp);
1114 			}
1115 		}
1116 	}
1117 	if (c == '\n')
1118 		++lineno;
1119 	if (conptr >= &context[NCONTEXT])
1120 		conptr = &context[0];
1121 	if (c != WEOF)
1122 		*conptr++ = c;
1123 	return (c);
1124 }
1125 
1126 /*
1127  * Return a character for lexical analyser.
1128  * Only one returned character is (not enforced) legitimite.
1129  */
1130 static void
lexungetc(wint_t c)1131 lexungetc(wint_t c)
1132 {
1133 	if (c == '\n')
1134 		--lineno;
1135 	if (c != WEOF) {
1136 		if (conptr == &context[0])
1137 			conptr = &context[NCONTEXT];
1138 		*--conptr = '\0';
1139 	}
1140 	if (progfp != FNULL) {
1141 		(void) ungetwc(c, progfp);
1142 		return;
1143 	}
1144 	if (c == WEOF)
1145 		return;
1146 	*--progptr = c;
1147 	proglen++;
1148 }
1149 
1150 /*
1151  * Syntax errors during parsing.
1152  */
1153 void
yyerror(char * s,...)1154 yyerror(char *s, ...)
1155 {
1156 	if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
1157 		if (lexlast == KEYWORD)
1158 			awkerr(gettext("inadmissible use of reserved keyword"));
1159 		else
1160 			awkerr(gettext("attempt to redefine builtin function"));
1161 	awkerr(s);
1162 }
1163 
/*
 * Error routine for all awk errors.
 * Takes a printf-style format and arguments and hands them to
 * awkierr() with no errno text requested.
 */
/* ARGSUSED */
void
awkerr(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(0, fmt, ap);
	va_end(ap);
}
1177 
/*
 * Error routine like "awkerr" except that the message is followed by
 * the strerror() text for the current errno.
 */
/* ARGSUSED */
void
awkperr(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(1, fmt, ap);
	va_end(ap);
}
1192 
1193 /*
1194  * Common internal routine for awkerr, awkperr
1195  */
1196 static void
awkierr(int perr,char * fmt,va_list ap)1197 awkierr(int perr, char *fmt, va_list ap)
1198 {
1199 	static char sep1[] = "\n>>>\t";
1200 	static char sep2[] = "\t<<<";
1201 	int saveerr = errno;
1202 
1203 	(void) fprintf(stderr, "%s: ", _cmdname);
1204 	if (running) {
1205 		(void) fprintf(stderr, gettext("line %u ("),
1206 		    curnode == NNULL ? 0 : curnode->n_lineno);
1207 		if (phase == 0)
1208 			(void) fprintf(stderr, "NR=%lld): ",
1209 			    (INT)exprint(varNR));
1210 		else
1211 			(void) fprintf(stderr, "%s): ",
1212 			    phase == BEGIN ? s_BEGIN : s_END);
1213 	} else if (lineno != 0) {
1214 		(void) fprintf(stderr, gettext("file \"%s\": "), filename);
1215 		(void) fprintf(stderr, gettext("line %u: "), lineno);
1216 	}
1217 	(void) vfprintf(stderr, gettext(fmt), ap);
1218 	if (perr == 1)
1219 		(void) fprintf(stderr, ": %s", strerror(saveerr));
1220 	if (perr != 2 && !running) {
1221 		wchar_t *cp;
1222 		int n;
1223 		int c;
1224 
1225 		(void) fprintf(stderr, gettext("  Context is:%s"), sep1);
1226 		cp = conptr;
1227 		n = NCONTEXT;
1228 		do {
1229 			if (cp >= &context[NCONTEXT])
1230 				cp = &context[0];
1231 			if ((c = *cp++) != '\0')
1232 				(void) fputs(c == '\n' ? sep1 : toprint(c),
1233 				    stderr);
1234 		} while (--n != 0);
1235 		(void) fputs(sep2, stderr);
1236 	}
1237 	(void) fprintf(stderr, "\n");
1238 	exit(1);
1239 }
1240 
1241 wchar_t *
emalloc(unsigned n)1242 emalloc(unsigned n)
1243 {
1244 	wchar_t *cp;
1245 
1246 	if ((cp = malloc(n)) == NULL)
1247 		awkerr(nomem);
1248 	return (cp);
1249 }
1250 
1251 wchar_t *
erealloc(wchar_t * p,unsigned n)1252 erealloc(wchar_t *p, unsigned n)
1253 {
1254 	wchar_t *cp;
1255 
1256 	if ((cp = realloc(p, n)) == NULL)
1257 		awkerr(nomem);
1258 	return (cp);
1259 }
1260 
1261 
/*
 * Print the usage message for awk on stderr.
 * Always returns 2, which main() uses as its exit status.
 */
static int
usage()
{
	const char *msg;

	msg = gettext(
"Usage:	awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
"	awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n");
	(void) fprintf(stderr, msg);
	return (2);
}
1273 
1274 
/*
 * Convert a multibyte string to a wide string held in a single
 * function-owned buffer.  The result of the previous call is freed,
 * so only the most recent return value is valid.
 */
static wchar_t *
mbconvert(char *str)
{
	static wchar_t *last;

	free(last);		/* free(NULL) does nothing */
	last = mbstowcsdup(str);
	return (last);
}
1284 
/*
 * Convert a wide string to a multibyte string held in a single
 * function-owned buffer.  The result of the previous call is freed,
 * so only the most recent return value is valid.
 */
char *
mbunconvert(wchar_t *str)
{
	static char *last;

	free(last);		/* free(NULL) does nothing */
	last = wcstombsdup(str);
	return (last);
}
1294 
1295 /*
1296  * Solaris port - following functions are typical MKS functions written
1297  * to work for Solaris.
1298  */
1299 
/*
 * Return a newly allocated wide-character copy of the multibyte
 * string `s', or NULL if memory cannot be allocated or `s' contains
 * an invalid multibyte sequence.  The caller frees the result.
 */
wchar_t *
mbstowcsdup(char *s)
{
	size_t n;
	wchar_t *w;

	/* a wide string never has more elements than the input has bytes */
	n = strlen(s) + 1;
	if ((w = malloc(n * sizeof (wchar_t))) == NULL)
		return (NULL);

	if (mbstowcs(w, s, n) == (size_t)-1) {
		int saverr = errno;

		/* do not leak the buffer on a conversion failure */
		free(w);
		errno = saverr;
		return (NULL);
	}
	return (w);
}
1316 
/*
 * Return a newly allocated multibyte copy of the wide string `w', or
 * NULL if memory cannot be allocated or `w' contains a character that
 * cannot be converted (errno is preserved from wcstombs() in that
 * case).  The caller frees the result.
 */
char *
wcstombsdup(wchar_t *w)
{
	size_t n;
	char *mb;
	char *shrunk;

	/* Fetch memory for worst case string length */
	n = (wcslen(w) + 1) * MB_CUR_MAX;
	if ((mb = malloc(n)) == NULL)
		return (NULL);

	/* Convert the string */
	if (wcstombs(mb, w, n) == (size_t)-1) {
		int saverr = errno;

		free(mb);
		errno = saverr;
		return (NULL);
	}

	/*
	 * Shrink the string down.  If that fails the larger buffer is
	 * still a valid result, so return it rather than leaking it.
	 */
	shrunk = realloc(mb, strlen(mb) + 1);
	return (shrunk != NULL ? shrunk : mb);
}
1345 
/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  The first entry is for value 127, thus the
 * entry for character value v (0..31) is at index v + 1.
 */
static const char *const upe_ctrls[] =
{
	"^?",
	"^@",  "^A",  "^B",  "^C",  "^D",  "^E",  "^F",  "^G",
	"^H",  "^I",  "^J",  "^K",  "^L",  "^M",  "^N",  "^O",
	"^P",  "^Q",  "^R",  "^S",  "^T",  "^U",  "^V",  "^W",
	"^X",  "^Y",  "^Z",  "^[",  "^\\", "^]",  "^^",  "^_"
};


/*
 * Return a printable string corresponding to the given character value.  If
 * the character is printable, simply return it as the string.  If it is in
 * the range specified by table 5-101 in the UPE, return the corresponding
 * string.  Otherwise, return an octal escape sequence.
 *
 * The result points at static storage (or at a table entry) and is
 * overwritten by the next call.
 */
static const char *
toprint(wint_t c)
{
	int n, len;
	unsigned char *ptr;
	static char mbch[MB_LEN_MAX+1];
	static char buf[5 * MB_LEN_MAX + 1];

	if ((n = wctomb(mbch, (wchar_t)c)) == -1) {
		/* Should never happen */
		(void) snprintf(buf, sizeof (buf), "\\%x", (unsigned int)c);
		return (buf);
	}
	mbch[n] = '\0';
	if (iswprint(c)) {
		return (mbch);
	} else if (c == 127) {
		return (upe_ctrls[0]);
	} else if ((unsigned long)c < 32) {
		/*
		 * Print as in Table 5-101 in the UPE.  The unsigned
		 * comparison keeps a negative value out of the table.
		 */
		return (upe_ctrls[c+1]);
	} else {
		/* Print as an octal escape sequence, one per byte */
		for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr)
			len += snprintf(buf+len, sizeof (buf) - len,
			    "\\%03o", *ptr);
	}
	return (buf);
}
1396 
/*
 * Translate a byte offset into the multibyte form of `astring' into the
 * corresponding index into the wide string itself.
 *
 * Wide characters are consumed until their encoded lengths add up to at
 * least `off' bytes or the end of the string is reached.  A character
 * that cannot be encoded is counted as one byte.  The scan never moves
 * past the terminating null, so an `off' larger than the encoded length
 * yields the length of the string.
 */
static int
wcoff(const wchar_t *astring, const int off)
{
	const wchar_t *s;
	int nbytes = 0;
	char mb[MB_LEN_MAX];

	/*
	 * wctomb() reports a length of 1 for the null wide character, so
	 * the end of the string has to be detected on the character itself.
	 */
	for (s = astring; nbytes < off && *s != L'\0'; s++) {
		int n = wctomb(mb, *s);

		if (n == 0)
			break;
		if (n == -1)
			n = 1;
		nbytes += n;
	}

	return ((int)(s - astring));
}
1416 
/*
 * Cache of compiled regular expressions, keyed by the wide-character
 * pattern text.  Entries are reference counted; entries whose count has
 * dropped to zero stay cached until int_regwfree() trims them.
 */
#define	NREGHASH	64	/* number of buckets in reghash[] */
#define	NREGHOLD	1024	/* max number unused entries */

/* number of cached entries whose refcnt is currently zero */
static int	nregunref;

/*
 * Link that places a cache entry on one of the reghash[] chains.  The
 * qelem is the first member because int_regwcomp() casts the chain's
 * qelem pointers back to struct reghashq.
 */
struct reghashq {
	struct qelem hq;
	struct regcache *regcachep;	/* entry that owns this link */
};

/*
 * One cached regular expression.  The qelem `lq' is the first member
 * because qelem pointers taken from the reglink list are cast back to
 * struct regcache.
 */
struct regcache {
	struct qelem	lq;		/* link on the reglink list */
	wchar_t	*pattern;		/* copy of the pattern text */
	regex_t	re;			/* compiled form of pattern */
	int	refcnt;			/* number of current users */
	struct reghashq	hash;		/* link on a reghash[] chain */
};

/*
 * reghash[] holds the heads of the hash chains; reglink is the head of
 * the list of all entries.  int_regwcomp() inserts new entries, and
 * entries it finds again, directly after reglink.
 */
static struct qelem reghash[NREGHASH], reglink;
1436 
1437 /*
1438  * Generate a hash value of the given wchar string.
1439  * The hashing method is similar to what Java does for strings.
1440  */
1441 static uint_t
regtxthash(const wchar_t * str)1442 regtxthash(const wchar_t *str)
1443 {
1444 	int k = 0;
1445 
1446 	while (*str != L'\0')
1447 		k = (31 * k) + *str++;
1448 
1449 	k += ~(k << 9);
1450 	k ^=  (k >> 14);
1451 	k +=  (k << 4);
1452 	k ^=  (k >> 10);
1453 
1454 	return (k % NREGHASH);
1455 }
1456 
1457 int
int_regwcomp(REGEXP * r,const wchar_t * pattern)1458 int_regwcomp(REGEXP *r, const wchar_t *pattern)
1459 {
1460 	regex_t re;
1461 	char *mbpattern;
1462 	int ret;
1463 	uint_t key;
1464 	struct qelem *qp;
1465 	struct regcache *rcp;
1466 
1467 	key = regtxthash(pattern);
1468 	for (qp = reghash[key].q_forw; qp != NULL; qp = qp->q_forw) {
1469 		rcp = ((struct reghashq *)qp)->regcachep;
1470 		if (*rcp->pattern == *pattern &&
1471 		    wcscmp(rcp->pattern, pattern) == 0)
1472 			break;
1473 	}
1474 	if (qp != NULL) {
1475 		/* update link. put this one at the beginning */
1476 		if (rcp != (struct regcache *)reglink.q_forw) {
1477 			remque(&rcp->lq);
1478 			insque(&rcp->lq, &reglink);
1479 		}
1480 		if (rcp->refcnt == 0)
1481 			nregunref--;	/* no longer unref'ed */
1482 		rcp->refcnt++;
1483 		*(struct regcache **)r = rcp;
1484 		return (REG_OK);
1485 	}
1486 
1487 	if ((mbpattern = wcstombsdup((wchar_t *)pattern)) == NULL)
1488 		return (REG_ESPACE);
1489 
1490 	ret = regcomp(&re, mbpattern, REG_EXTENDED);
1491 
1492 	free(mbpattern);
1493 
1494 	if (ret != REG_OK)
1495 		return (ret);
1496 
1497 	if ((rcp = malloc(sizeof (struct regcache))) == NULL)
1498 		return (REG_ESPACE);
1499 	rcp->re = re;
1500 	if ((rcp->pattern = wsdup(pattern)) == NULL) {
1501 		regfree(&re);
1502 		free(rcp);
1503 		return (REG_ESPACE);
1504 	}
1505 	rcp->refcnt = 1;
1506 	insque(&rcp->lq, &reglink);
1507 	insque(&rcp->hash.hq, &reghash[key]);
1508 	rcp->hash.regcachep = rcp;
1509 
1510 	*(struct regcache **)r = rcp;
1511 	return (ret);
1512 }
1513 
1514 void
int_regwfree(REGEXP r)1515 int_regwfree(REGEXP r)
1516 {
1517 	int	cnt;
1518 	struct qelem *qp, *nqp;
1519 	struct regcache *rcp;
1520 
1521 	rcp = (struct regcache *)r;
1522 
1523 	if (--rcp->refcnt != 0)
1524 		return;
1525 
1526 	/* this cache has no reference */
1527 	if (++nregunref < NREGHOLD)
1528 		return;
1529 
1530 	/*
1531 	 * We've got too much unref'ed regex. Free half of least
1532 	 * used regex.
1533 	 */
1534 	cnt = 0;
1535 	for (qp = reglink.q_forw; qp != NULL; qp = nqp) {
1536 		nqp = qp->q_forw;
1537 		rcp = (struct regcache *)qp;
1538 		if (rcp->refcnt != 0)
1539 			continue;
1540 
1541 		/* free half of them */
1542 		if (++cnt < (NREGHOLD / 2))
1543 			continue;
1544 
1545 		/* detach and free */
1546 		remque(&rcp->lq);
1547 		remque(&rcp->hash.hq);
1548 
1549 		/* free up */
1550 		free(rcp->pattern);
1551 		regfree(&rcp->re);
1552 		free(rcp);
1553 
1554 		nregunref--;
1555 	}
1556 }
1557 
1558 size_t
int_regwerror(int errcode,REGEXP r,char * errbuf,size_t bufsiz)1559 int_regwerror(int errcode, REGEXP r, char *errbuf, size_t bufsiz)
1560 {
1561 	struct regcache *rcp;
1562 
1563 	rcp = (struct regcache *)r;
1564 	return (regerror(errcode, &rcp->re, errbuf, bufsiz));
1565 }
1566 
1567 int
int_regwexec(REGEXP r,const wchar_t * astring,size_t nsub,int_regwmatch_t * sub,int flags)1568 int_regwexec(REGEXP r,		/* compiled RE */
1569 	const wchar_t *astring,	/* subject string */
1570 	size_t nsub,		/* number of subexpressions */
1571 	int_regwmatch_t *sub,	/* subexpression pointers */
1572 	int flags)
1573 {
1574 	char *mbs;
1575 	regmatch_t *mbsub = NULL;
1576 	int i;
1577 	struct regcache *rcp;
1578 
1579 	if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL)
1580 		return (REG_ESPACE);
1581 
1582 	if (nsub > 0 && sub) {
1583 		if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
1584 			return (REG_ESPACE);
1585 	}
1586 
1587 	rcp = (struct regcache *)r;
1588 
1589 	i = regexec(&rcp->re, mbs, nsub, mbsub, flags);
1590 
1591 	/* Now, adjust the pointers/counts in sub */
1592 	if (i == REG_OK && nsub > 0 && mbsub) {
1593 		int j, k;
1594 
1595 		for (j = 0; j < nsub; j++) {
1596 			regmatch_t *ms = &mbsub[j];
1597 			int_regwmatch_t *ws = &sub[j];
1598 
1599 			if ((k = ms->rm_so) >= 0) {
1600 				ws->rm_so = wcoff(astring, k);
1601 				ws->rm_sp = astring + ws->rm_so;
1602 			}
1603 			if ((k = ms->rm_eo) >= 0) {
1604 				ws->rm_eo = wcoff(astring, k);
1605 				ws->rm_ep = astring + ws->rm_eo;
1606 			}
1607 		}
1608 	}
1609 
1610 	free(mbs);
1611 	if (mbsub)
1612 		free(mbsub);
1613 	return (i);
1614 }
1615 
1616 int
int_regwdosuba(REGEXP rp,const wchar_t * rpl,const wchar_t * src,wchar_t ** dstp,int len,int * globp)1617 int_regwdosuba(REGEXP rp,		/* compiled RE: Pattern */
1618 	const wchar_t *rpl,		/* replacement string: /rpl/ */
1619 	const wchar_t *src,		/* source string */
1620 	wchar_t **dstp,			/* destination string */
1621 	int len,			/* destination length */
1622 	int *globp)	/* IN: occurence, 0 for all; OUT: substitutions */
1623 {
1624 	wchar_t *dst, *odst;
1625 	const wchar_t *ip, *xp;
1626 	wchar_t *op;
1627 	int i;
1628 	wchar_t c;
1629 	int glob, iglob = *globp, oglob = 0;
1630 #define	NSUB	10
1631 	int_regwmatch_t rm[NSUB], *rmp;
1632 	int flags;
1633 	wchar_t *end;
1634 	int regerr;
1635 
1636 /* handle overflow of dst. we need "i" more bytes */
1637 #ifdef OVERFLOW
1638 #undef OVERFLOW
1639 #define	OVERFLOW(i) { \
1640 		int pos = op - dst; \
1641 		dst = (wchar_t *)realloc(odst = dst, \
1642 			(len += len + i) * sizeof (wchar_t)); \
1643 		if (dst == NULL) \
1644 			goto nospace; \
1645 		op = dst + pos; \
1646 		end = dst + len; \
1647 	}
1648 #endif
1649 
1650 	*dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t));
1651 	if (dst == NULL)
1652 		return (REG_ESPACE);
1653 
1654 	if (rp == NULL || rpl == NULL || src == NULL || dst ==  NULL)
1655 		return (REG_EFATAL);
1656 
1657 	glob = 0;	/* match count */
1658 	ip = src;	/* source position */
1659 	op = dst;	/* destination position */
1660 	end = dst + len;
1661 
1662 	flags = 0;
1663 	while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
1664 		/* Copy text preceding match */
1665 		if (op + (i = rm[0].rm_sp - ip) >= end)
1666 			OVERFLOW(i)
1667 		while (i--)
1668 			*op++ = *ip++;
1669 
1670 		if (iglob == 0 || ++glob == iglob) {
1671 			oglob++;
1672 			xp = rpl;		/* do substitute */
1673 		} else
1674 			xp = L"&";		/* preserve text */
1675 
1676 		/* Perform replacement of matched substing */
1677 		while ((c = *xp++) != '\0') {
1678 			rmp = NULL;
1679 			if (c == '&')
1680 				rmp = &rm[0];
1681 			else if (c == '\\') {
1682 				if ('0' <= *xp && *xp <= '9')
1683 					rmp = &rm[*xp++ - '0'];
1684 				else if (*xp != '\0')
1685 					c = *xp++;
1686 			}
1687 
1688 			if (rmp ==  NULL) {	/* Ordinary character. */
1689 				*op++ = c;
1690 				if (op >= end)
1691 					OVERFLOW(1)
1692 			} else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
1693 				ip = rmp->rm_sp;
1694 				if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
1695 					OVERFLOW(i)
1696 				while (i--)
1697 					*op++ = *ip++;
1698 			}
1699 		}
1700 
1701 		ip = rm[0].rm_ep;
1702 		if (*ip == '\0')	/* If at end break */
1703 			break;
1704 		else if (rm[0].rm_sp == rm[0].rm_ep) {
1705 			/* If empty match copy next char */
1706 			*op++ = *ip++;
1707 			if (op >= end)
1708 				OVERFLOW(1)
1709 		}
1710 		flags = REG_NOTBOL;
1711 	}
1712 
1713 	if (regerr != REG_OK && regerr != REG_NOMATCH)
1714 		return (regerr);
1715 
1716 	/* Copy rest of text */
1717 	if (op + (i =  wcslen(ip)) >= end)
1718 		OVERFLOW(i)
1719 	while (i--)
1720 		*op++ = *ip++;
1721 	*op++ = '\0';
1722 
1723 	if ((*dstp = dst = (wchar_t *)realloc(odst = dst,
1724 	    sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
1725 nospace:
1726 		free(odst);
1727 		return (REG_ESPACE);
1728 	}
1729 
1730 	*globp = oglob;
1731 
1732 	return ((oglob == 0) ? REG_NOMATCH : REG_OK);
1733 }
1734