xref: /freebsd/contrib/one-true-awk/run.c (revision 3fd60a6b73ac01a72df89751f173970fae4cae73)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #define DEBUG
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <errno.h>
29 #include <wctype.h>
30 #include <fcntl.h>
31 #include <setjmp.h>
32 #include <limits.h>
33 #include <math.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <time.h>
37 #include <sys/types.h>
38 #include <sys/wait.h>
39 #include "awk.h"
40 #include "awkgram.tab.h"
41 
42 
43 static void stdinit(void);
44 static void flush_all(void);
45 static char *wide_char_to_byte_str(int rune, size_t *outlen);
46 
/*
 * tempfree(x): give cell x back to the temporary-cell free list through
 * tfree(), but only when istemp(x) holds.
 *
 * The macro form is the one that is compiled (#if 1).  The function in the
 * #else arm is a disabled debugging variant that additionally warns when an
 * OCELL cell carries a csub outside the CUNK..CFREE range.
 */
#if 1
#define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
#else
void tempfree(Cell *p) {
	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
		WARNING("bad csub %d in Cell %d %s",
			p->csub, p->ctype, p->sval);
	}
	if (istemp(p))
		tfree(p);
}
#endif
59 
60 /* do we really need these? */
61 /* #ifdef _NFILE */
62 /* #ifndef FOPEN_MAX */
63 /* #define FOPEN_MAX _NFILE */
64 /* #endif */
65 /* #endif */
66 /*  */
67 /* #ifndef	FOPEN_MAX */
68 /* #define	FOPEN_MAX	40 */	/* max number of open files */
69 /* #endif */
70 /*  */
71 /* #ifndef RAND_MAX */
72 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
73 /* #endif */
74 
jmp_buf env;		/* target of the longjmp() in jump() for `exit'; armed by setjmp() in program() */
extern	int	pairstack[];
extern	Awkfloat	srand_seed;

Node	*winner = NULL;	/* root of parse tree */
Cell	*tmps;		/* free temporary cells for execution */

/*
 * Statically allocated sentinel cells.  The boolean pair is handed back by
 * the boolean/relational/match operators; the OJUMP cells are what jump()
 * returns for the corresponding statement so callers can test the result
 * with isjump()/isexit()/isnext() and friends.
 */
static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
Cell	*True	= &truecell;
static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*False	= &falsecell;
static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jbreak	= &breakcell;
static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jcont	= &contcell;
static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnext	= &nextcell;
static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnextfile	= &nextfilecell;
static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jexit	= &exitcell;
static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jret	= &retcell;
/* template that gettemp() copies into every temporary cell it hands out */
static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };

Node	*curnode = NULL;	/* the node being executed, for debugging */
101 
/*
 * adjbuf: buffer memory management.
 *
 * Makes sure the buffer at *pbuf can hold at least minlen bytes, growing it
 * with realloc() when *psiz is too small.
 *
 *   pbuf     address of the pointer to the managed buffer
 *   psiz     address of the variable holding the buffer's current size
 *   minlen   minimum number of bytes required
 *   quantum  growth granularity; a non-zero value rounds the new size up
 *            to the next multiple of it
 *   pbptr    address of a cursor pointing into the buffer, or 0 if there is
 *            none; it is re-based onto the new storage after a move
 *   whatrtn  name of the calling routine; when non-NULL an allocation
 *            failure is fatal instead of being reported to the caller
 *
 * Returns 0 if realloc() failed (only possible when whatrtn is NULL),
 * non-zero otherwise.
 */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
{
	char *grown;
	int leftover, cursor_off;

	if (minlen <= *psiz)
		return 1;	/* already big enough: nothing to do */

	leftover = quantum ? minlen % quantum : 0;
	cursor_off = pbptr ? *pbptr - *pbuf : 0;	/* remember cursor position */
	if (leftover != 0)
		minlen += quantum - leftover;	/* next multiple of quantum */

	grown = (char *) realloc(*pbuf, minlen);
	DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", whatrtn, *psiz, minlen, (void*)*pbuf, (void*)grown);
	if (grown == NULL) {
		if (whatrtn)
			FATAL("out of memory in %s", whatrtn);
		return 0;
	}

	*pbuf = grown;
	*psiz = minlen;
	if (pbptr)
		*pbptr = grown + cursor_off;
	return 1;
}
136 
137 void run(Node *a)	/* execution of parse tree starts here */
138 {
139 
140 	stdinit();
141 	execute(a);
142 	closeall();
143 }
144 
145 Cell *execute(Node *u)	/* execute a node of the parse tree */
146 {
147 	Cell *(*proc)(Node **, int);
148 	Cell *x;
149 	Node *a;
150 
151 	if (u == NULL)
152 		return(True);
153 	for (a = u; ; a = a->nnext) {
154 		curnode = a;
155 		if (isvalue(a)) {
156 			x = (Cell *) (a->narg[0]);
157 			if (isfld(x) && !donefld)
158 				fldbld();
159 			else if (isrec(x) && !donerec)
160 				recbld();
161 			return(x);
162 		}
163 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
164 			FATAL("illegal statement");
165 		proc = proctab[a->nobj-FIRSTTOKEN];
166 		x = (*proc)(a->narg, a->nobj);
167 		if (isfld(x) && !donefld)
168 			fldbld();
169 		else if (isrec(x) && !donerec)
170 			recbld();
171 		if (isexpr(a))
172 			return(x);
173 		if (isjump(x))
174 			return(x);
175 		if (a->nnext == NULL)
176 			return(x);
177 		tempfree(x);
178 	}
179 }
180 
181 
/*
 * program: execute an awk program.
 *
 * a[0] is the BEGIN action, a[1] the main body, a[2] the END action; any of
 * them may be NULL.  Always returns True.
 *
 * `exit' is delivered by jump() as longjmp(env, 1).  env is armed twice:
 * the first setjmp() covers BEGIN and the record loop and resumes at the
 * END section; the second covers END itself and resumes at the final return.
 */
Cell *program(Node **a, int n)	/* execute an awk program */
{				/* a[0] = BEGIN, a[1] = body, a[2] = END */
	Cell *x;

	if (setjmp(env) != 0)	/* exit taken in BEGIN or body: go run END */
		goto ex;
	if (a[0]) {		/* BEGIN */
		x = execute(a[0]);
		if (isexit(x))
			return(True);
		if (isjump(x))
			FATAL("illegal break, continue, next or nextfile from BEGIN");
		tempfree(x);
	}
	/* input is only read when there is a body or an END action */
	if (a[1] || a[2])
		while (getrec(&record, &recsize, true) > 0) {
			x = execute(a[1]);
			if (isexit(x))
				break;
			tempfree(x);
		}
  ex:
	if (setjmp(env) != 0)	/* handles exit within END */
		goto ex1;
	if (a[2]) {		/* END */
		x = execute(a[2]);
		if (isbreak(x) || isnext(x) || iscont(x))
			FATAL("illegal break, continue, next or nextfile from END");
		tempfree(x);
	}
  ex1:
	return(True);
}
215 
/*
 * One Frame is pushed by call() for every active awk function invocation;
 * arg() and the RETURN case of jump() read the current one through frp.
 */
struct Frame {	/* stack frame for awk function calls */
	int nargs;	/* number of arguments in this call */
	Cell *fcncell;	/* pointer to Cell for function */
	Cell **args;	/* pointer to array of arguments after execute */
	Cell *retval;	/* return value */
};

/* call() sizes its on-stack argument arrays with this and refuses calls
 * whose supplied + declared argument count exceeds it */
#define	NARGS	50	/* max args in a call */

struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
int	nframe = 0;		/* number of frames allocated */
struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
228 
/*
 * call: invoke a user-defined awk function.
 *
 * a[0] evaluates to the function's cell, a[1] is the list of actual
 * argument expressions.  Returns the frame's return-value cell, or the
 * body's result cell itself when that is an exit or next jump.
 *
 * Steps: evaluate the actuals (arrays are passed by reference, everything
 * else as a copycell() copy), pad missing parameters with empty CCOPY
 * cells, push a Frame, execute the body, then release the argument cells.
 */
Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
{
	/* template for parameters the caller did not supply (they act as locals) */
	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
	int i, ncall, ndef;
	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
	Node *x;
	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
	Cell *y, *z, *fcn;
	char *s;

	fcn = execute(a[0]);	/* the function itself */
	s = fcn->nval;
	if (!isfcn(fcn))
		FATAL("calling undefined function %s", s);
	if (frame == NULL) {	/* first call ever: allocate the frame stack */
		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames calling %s", s);
	}
	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
		ncall++;
	ndef = (int) fcn->fval;			/* args in defn */
	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
	if (ncall > ndef)
		WARNING("function %s called with %d args, uses only %d",
			s, ncall, ndef);
	if (ncall + ndef > NARGS)
		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
		y = execute(x);
		oargs[i] = y;	/* remember the caller's cell for array write-back below */
		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
		if (isfcn(y))
			FATAL("can't use function %s as argument in %s", y->nval, s);
		if (isarr(y))
			args[i] = y;	/* arrays by ref */
		else
			args[i] = copycell(y);
		tempfree(y);
	}
	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
		args[i] = gettemp();
		*args[i] = newcopycell;
	}
	frp++;	/* now ok to up frame */
	if (frp >= frame + nframe) {	/* frame stack full: grow by 100 entries */
		int dfp = frp - frame;	/* old index */
		frame = (struct Frame *) realloc(frame, (nframe += 100) * sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames in %s", s);
		frp = frame + dfp;	/* re-base onto the possibly moved block */
	}
	frp->fcncell = fcn;
	frp->args = args;
	frp->nargs = ndef;	/* number defined with (excess are locals) */
	frp->retval = gettemp();

	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
	y = execute((Node *)(fcn->sval));	/* execute body */
	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));

	/* release the parameter cells */
	for (i = 0; i < ndef; i++) {
		Cell *t = frp->args[i];
		if (isarr(t)) {
			if (t->csub == CCOPY) {	/* a copy that the body turned into an array */
				if (i >= ncall) {
					/* unsupplied parameter, i.e. a local: drop its table */
					freesymtab(t);
					t->csub = CTEMP;
					tempfree(t);
				} else {
					/* supplied parameter: hand type and table to the caller's cell */
					oargs[i]->tval = t->tval;
					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
					oargs[i]->sval = t->sval;
					tempfree(t);
				}
			}
		} else if (t != y) {	/* kludge to prevent freeing twice */
			t->csub = CTEMP;
			tempfree(t);
		} else if (t == y && t->csub == CCOPY) {
			/* the body's result cell is this parameter: free it here, once */
			t->csub = CTEMP;
			tempfree(t);
			freed = 1;
		}
	}
	tempfree(fcn);
	/* NOTE(review): this return happens before the frp-- below, so the
	 * frame is not popped on this path -- confirm that is intended */
	if (isexit(y) || isnext(y))
		return y;
	if (freed == 0) {
		tempfree(y);	/* don't free twice! */
	}
	z = frp->retval;			/* return value */
	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
	frp--;
	return(z);
}
327 
328 Cell *copycell(Cell *x)	/* make a copy of a cell in a temp */
329 {
330 	Cell *y;
331 
332 	/* copy is not constant or field */
333 
334 	y = gettemp();
335 	y->tval = x->tval & ~(CON|FLD|REC);
336 	y->csub = CCOPY;	/* prevents freeing until call is over */
337 	y->nval = x->nval;	/* BUG? */
338 	if (isstr(x) /* || x->ctype == OCELL */) {
339 		y->sval = tostring(x->sval);
340 		y->tval &= ~DONTFREE;
341 	} else
342 		y->tval |= DONTFREE;
343 	y->fval = x->fval;
344 	return y;
345 }
346 
347 Cell *arg(Node **a, int n)	/* nth argument of a function */
348 {
349 
350 	n = ptoi(a[0]);	/* argument number, counting from 0 */
351 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
352 	if (n+1 > frp->nargs)
353 		FATAL("argument #%d of function %s was not supplied",
354 			n+1, frp->fcncell->nval);
355 	return frp->args[n];
356 }
357 
358 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
359 {
360 	Cell *y;
361 
362 	switch (n) {
363 	case EXIT:
364 		if (a[0] != NULL) {
365 			y = execute(a[0]);
366 			errorflag = (int) getfval(y);
367 			tempfree(y);
368 		}
369 		longjmp(env, 1);
370 	case RETURN:
371 		if (a[0] != NULL) {
372 			y = execute(a[0]);
373 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
374 				setsval(frp->retval, getsval(y));
375 				frp->retval->fval = getfval(y);
376 				frp->retval->tval |= NUM;
377 			}
378 			else if (y->tval & STR)
379 				setsval(frp->retval, getsval(y));
380 			else if (y->tval & NUM)
381 				setfval(frp->retval, getfval(y));
382 			else		/* can't happen */
383 				FATAL("bad type variable %d", y->tval);
384 			tempfree(y);
385 		}
386 		return(jret);
387 	case NEXT:
388 		return(jnext);
389 	case NEXTFILE:
390 		nextfile();
391 		return(jnextfile);
392 	case BREAK:
393 		return(jbreak);
394 	case CONTINUE:
395 		return(jcont);
396 	default:	/* can't happen */
397 		FATAL("illegal jump type %d", n);
398 	}
399 	return 0;	/* not reached */
400 }
401 
402 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
403 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
404 	Cell *r, *x;
405 	extern Cell **fldtab;
406 	FILE *fp;
407 	char *buf;
408 	int bufsize = recsize;
409 	int mode;
410 	bool newflag;
411 	double result;
412 
413 	if ((buf = (char *) malloc(bufsize)) == NULL)
414 		FATAL("out of memory in getline");
415 
416 	fflush(stdout);	/* in case someone is waiting for a prompt */
417 	r = gettemp();
418 	if (a[1] != NULL) {		/* getline < file */
419 		x = execute(a[2]);		/* filename */
420 		mode = ptoi(a[1]);
421 		if (mode == '|')		/* input pipe */
422 			mode = LE;	/* arbitrary flag */
423 		fp = openfile(mode, getsval(x), &newflag);
424 		tempfree(x);
425 		if (fp == NULL)
426 			n = -1;
427 		else
428 			n = readrec(&buf, &bufsize, fp, newflag);
429 		if (n <= 0) {
430 			;
431 		} else if (a[0] != NULL) {	/* getline var <file */
432 			x = execute(a[0]);
433 			setsval(x, buf);
434 			if (is_number(x->sval, & result)) {
435 				x->fval = result;
436 				x->tval |= NUM;
437 			}
438 			tempfree(x);
439 		} else {			/* getline <file */
440 			setsval(fldtab[0], buf);
441 			if (is_number(fldtab[0]->sval, & result)) {
442 				fldtab[0]->fval = result;
443 				fldtab[0]->tval |= NUM;
444 			}
445 		}
446 	} else {			/* bare getline; use current input */
447 		if (a[0] == NULL)	/* getline */
448 			n = getrec(&record, &recsize, true);
449 		else {			/* getline var */
450 			n = getrec(&buf, &bufsize, false);
451 			if (n > 0) {
452 				x = execute(a[0]);
453 				setsval(x, buf);
454 				if (is_number(x->sval, & result)) {
455 					x->fval = result;
456 					x->tval |= NUM;
457 				}
458 				tempfree(x);
459 			}
460 		}
461 	}
462 	setfval(r, (Awkfloat) n);
463 	free(buf);
464 	return r;
465 }
466 
467 Cell *getnf(Node **a, int n)	/* get NF */
468 {
469 	if (!donefld)
470 		fldbld();
471 	return (Cell *) a[0];
472 }
473 
474 static char *
475 makearraystring(Node *p, const char *func)
476 {
477 	char *buf;
478 	int bufsz = recsize;
479 	size_t blen;
480 
481 	if ((buf = (char *) malloc(bufsz)) == NULL) {
482 		FATAL("%s: out of memory", func);
483 	}
484 
485 	blen = 0;
486 	buf[blen] = '\0';
487 
488 	for (; p; p = p->nnext) {
489 		Cell *x = execute(p);	/* expr */
490 		char *s = getsval(x);
491 		size_t seplen = strlen(getsval(subseploc));
492 		size_t nsub = p->nnext ? seplen : 0;
493 		size_t slen = strlen(s);
494 		size_t tlen = blen + slen + nsub;
495 
496 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
497 			FATAL("%s: out of memory %s[%s...]",
498 			    func, x->nval, buf);
499 		}
500 		memcpy(buf + blen, s, slen);
501 		if (nsub) {
502 			memcpy(buf + blen + slen, *SUBSEP, nsub);
503 		}
504 		buf[tlen] = '\0';
505 		blen = tlen;
506 		tempfree(x);
507 	}
508 	return buf;
509 }
510 
511 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
512 {
513 	Cell *x, *z;
514 	char *buf;
515 
516 	x = execute(a[0]);	/* Cell* for symbol table */
517 	buf = makearraystring(a[1], __func__);
518 	if (!isarr(x)) {
519 		DPRINTF("making %s into an array\n", NN(x->nval));
520 		if (freeable(x))
521 			xfree(x->sval);
522 		x->tval &= ~(STR|NUM|DONTFREE);
523 		x->tval |= ARR;
524 		x->sval = (char *) makesymtab(NSYMTAB);
525 	}
526 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
527 	z->ctype = OCELL;
528 	z->csub = CVAR;
529 	tempfree(x);
530 	free(buf);
531 	return(z);
532 }
533 
534 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
535 {
536 	Cell *x;
537 
538 	x = execute(a[0]);	/* Cell* for symbol table */
539 	if (x == symtabloc) {
540 		FATAL("cannot delete SYMTAB or its elements");
541 	}
542 	if (!isarr(x))
543 		return True;
544 	if (a[1] == NULL) {	/* delete the elements, not the table */
545 		freesymtab(x);
546 		x->tval &= ~STR;
547 		x->tval |= ARR;
548 		x->sval = (char *) makesymtab(NSYMTAB);
549 	} else {
550 		char *buf = makearraystring(a[1], __func__);
551 		freeelem(x, buf);
552 		free(buf);
553 	}
554 	tempfree(x);
555 	return True;
556 }
557 
558 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
559 {
560 	Cell *ap, *k;
561 	char *buf;
562 
563 	ap = execute(a[1]);	/* array name */
564 	if (!isarr(ap)) {
565 		DPRINTF("making %s into an array\n", ap->nval);
566 		if (freeable(ap))
567 			xfree(ap->sval);
568 		ap->tval &= ~(STR|NUM|DONTFREE);
569 		ap->tval |= ARR;
570 		ap->sval = (char *) makesymtab(NSYMTAB);
571 	}
572 	buf = makearraystring(a[0], __func__);
573 	k = lookup(buf, (Array *) ap->sval);
574 	tempfree(ap);
575 	free(buf);
576 	if (k == NULL)
577 		return(False);
578 	else
579 		return(True);
580 }
581 
582 
583 /* ======== utf-8 code ========== */
584 
585 /*
586  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
587  * or utf-8.  u8_isutf tests whether a string starts with a valid
588  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
589  * u8_nextlen returns length of next valid sequence, which is
590  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
591  * u8_strlen returns length of string in valid utf-8 sequences
592  * and/or high-bit bytes.  Conversion functions go between byte
593  * number and character number.
594  *
595  * In theory, this behaves the same as before for non-utf8 bytes.
596  *
597  * Limited checking! This is a potential security hole.
598  */
599 
600 /* is s the beginning of a valid utf-8 string? */
601 /* return length 1..4 if yes, 0 if no */
602 int u8_isutf(const char *s)
603 {
604 	int n, ret;
605 	unsigned char c;
606 
607 	c = s[0];
608 	if (c < 128 || awk_mb_cur_max == 1)
609 		return 1; /* what if it's 0? */
610 
611 	n = strlen(s);
612 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
613 		ret = 2; /* 110xxxxx 10xxxxxx */
614 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
615 			 && (s[2] & 0xC0) == 0x80) {
616 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
617 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
618 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
619 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
620 	} else {
621 		ret = 0;
622 	}
623 	return ret;
624 }
625 
626 /* Convert (prefix of) utf8 string to utf-32 rune. */
627 /* Sets *rune to the value, returns the length. */
628 /* No error checking: watch out. */
629 int u8_rune(int *rune, const char *s)
630 {
631 	int n, ret;
632 	unsigned char c;
633 
634 	c = s[0];
635 	if (c < 128 || awk_mb_cur_max == 1) {
636 		*rune = c;
637 		return 1;
638 	}
639 
640 	n = strlen(s);
641 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
642 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
643 		ret = 2;
644 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
645 			  && (s[2] & 0xC0) == 0x80) {
646 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
647 			/* 1110xxxx 10xxxxxx 10xxxxxx */
648 		ret = 3;
649 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
650 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
651 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
652 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
653 		ret = 4;
654 	} else {
655 		*rune = c;
656 		ret = 1;
657 	}
658 	return ret; /* returns one byte if sequence doesn't look like utf */
659 }
660 
/*
 * u8_nextlen: length in bytes of the next sequence in s: 2..4 for valid
 * utf-8, 1 for ascii or for a byte that does not start a valid sequence.
 */
int u8_nextlen(const char *s)
{
	int seqlen = u8_isutf(s);

	return (seqlen != 0) ? seqlen : 1;
}
671 
672 /* return number of utf characters or single non-utf bytes */
673 int u8_strlen(const char *s)
674 {
675 	int i, len, n, totlen;
676 	unsigned char c;
677 
678 	n = strlen(s);
679 	totlen = 0;
680 	for (i = 0; i < n; i += len) {
681 		c = s[i];
682 		if (c < 128 || awk_mb_cur_max == 1) {
683 			len = 1;
684 		} else {
685 			len = u8_nextlen(&s[i]);
686 		}
687 		totlen++;
688 		if (i > n)
689 			FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
690 	}
691 	return totlen;
692 }
693 
/*
 * u8_char2byte: convert a utf-8 character number in s to its byte offset.
 *
 * charnum counts characters from 0; the result is the offset of the first
 * byte of that character.  When s holds fewer than charnum characters the
 * walk stops at the terminating NUL and the length of s in bytes is
 * returned, so the scan never reads past the end of the string.
 * A charnum <= 0 yields 0.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}
708 
/*
 * u8_byte2char: convert a byte offset in s to a utf-8 character number.
 *
 * Counts the characters whose first byte lies at an offset <= bytenum, so
 * offset 0 maps to 1 and a negative offset maps to 0 (BUG: what origin?
 * 0 matches start==0, which means no match).  Returns -1 (???) when
 * bytenum lies beyond the terminating NUL.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int pos = 0;
	int nchars = 0;
	int total = strlen(s);

	if (bytenum > total)
		return -1; /* ??? */

	while (pos <= bytenum) {
		pos += u8_nextlen(s + pos);
		nchars++;
	}
	return nchars;
}
726 
/* runetochar() adapted from rune.c in the Plan 9 distribution */
728 
/* Constants for the utf-8 encoder below. */
enum
{
	/*
	 * U+FFFD REPLACEMENT CHARACTER.  runetochar() substitutes it for an
	 * out-of-range rune on its three-byte path, so it has to be a value
	 * that really needs three bytes; a value below 0x800 would come out
	 * as an overlong (invalid) utf-8 sequence.
	 */
	Runeerror = 0xFFFD,
	Runemax = 0x10FFFF,	/* largest rune that is encoded as itself */

	/* number of payload bits in a lead byte (Bit1..Bit5) or continuation byte (Bitx) */
	Bit1    = 7,
	Bitx    = 6,
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	/* tag bits that mark each kind of byte */
	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */

	/* largest rune that fits in a sequence of 1, 2, 3 or 4 bytes */
	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0001 1111 1111 1111 1111 1111 */

	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */

};

/*
 * runetochar: encode rune c as utf-8 into str.
 *
 * Writes 1..4 bytes (no terminating NUL) and returns how many were written;
 * str must have room for 4.  A rune above Runemax is replaced by Runeerror
 * and therefore encodes as the three bytes EF BF BD.
 */
int runetochar(char *str, int c)
{
	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	/* two character sequence 00080-007FF => T2 Tx */
	if (c <= Rune2) {
		str[0] = T2 | (c >> 1*Bitx);
		str[1] = Tx | (c & Maskx);
		return 2;
	}

	/*
	 * Out-of-range runes become the error rune.  The test sits here, not
	 * earlier, because such a rune cannot have matched the one- or
	 * two-byte cases and the error rune itself takes three bytes.
	 */
	if (c > Runemax)
		c = Runeerror;

	/* three character sequence 00800-0FFFF => T3 Tx Tx */
	if (c <= Rune3) {
		str[0] = T3 |  (c >> 2*Bitx);
		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
		str[2] = Tx |  (c & Maskx);
		return 3;
	}

	/* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */
	str[0] = T4 |  (c >> 3*Bitx);
	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
	str[3] = Tx |  (c & Maskx);
	return 4;
}
790 
791 
792 /* ========== end of utf8 code =========== */
793 
794 
795 
796 Cell *matchop(Node **a, int n)	/* ~ and match() */
797 {
798 	Cell *x, *y;
799 	char *s, *t;
800 	int i;
801 	int cstart, cpatlen, len;
802 	fa *pfa;
803 	int (*mf)(fa *, const char *) = match, mode = 0;
804 
805 	if (n == MATCHFCN) {
806 		mf = pmatch;
807 		mode = 1;
808 	}
809 	x = execute(a[1]);	/* a[1] = target text */
810 	s = getsval(x);
811 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
812 		i = (*mf)((fa *) a[2], s);
813 	else {
814 		y = execute(a[2]);	/* a[2] = regular expr */
815 		t = getsval(y);
816 		pfa = makedfa(t, mode);
817 		i = (*mf)(pfa, s);
818 		tempfree(y);
819 	}
820 	tempfree(x);
821 	if (n == MATCHFCN) {
822 		int start = patbeg - s + 1; /* origin 1 */
823 		if (patlen < 0) {
824 			start = 0; /* not found */
825 		} else {
826 			cstart = u8_byte2char(s, start-1);
827 			cpatlen = 0;
828 			for (i = 0; i < patlen; i += len) {
829 				len = u8_nextlen(patbeg+i);
830 				cpatlen++;
831 			}
832 
833 			start = cstart;
834 			patlen = cpatlen;
835 		}
836 
837 		setfval(rstartloc, (Awkfloat) start);
838 		setfval(rlengthloc, (Awkfloat) patlen);
839 		x = gettemp();
840 		x->tval = NUM;
841 		x->fval = start;
842 		return x;
843 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
844 		return(True);
845 	else
846 		return(False);
847 }
848 
849 
850 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
851 {
852 	Cell *x, *y;
853 	int i;
854 
855 	x = execute(a[0]);
856 	i = istrue(x);
857 	tempfree(x);
858 	switch (n) {
859 	case BOR:
860 		if (i) return(True);
861 		y = execute(a[1]);
862 		i = istrue(y);
863 		tempfree(y);
864 		if (i) return(True);
865 		else return(False);
866 	case AND:
867 		if ( !i ) return(False);
868 		y = execute(a[1]);
869 		i = istrue(y);
870 		tempfree(y);
871 		if (i) return(True);
872 		else return(False);
873 	case NOT:
874 		if (i) return(False);
875 		else return(True);
876 	default:	/* can't happen */
877 		FATAL("unknown boolean operator %d", n);
878 	}
879 	return 0;	/*NOTREACHED*/
880 }
881 
882 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
883 {
884 	int i;
885 	Cell *x, *y;
886 	Awkfloat j;
887 	bool x_is_nan, y_is_nan;
888 
889 	x = execute(a[0]);
890 	y = execute(a[1]);
891 	x_is_nan = isnan(x->fval);
892 	y_is_nan = isnan(y->fval);
893 	if (x->tval&NUM && y->tval&NUM) {
894 		if ((x_is_nan || y_is_nan) && n != NE)
895 			return(False);
896 		j = x->fval - y->fval;
897 		i = j<0? -1: (j>0? 1: 0);
898 	} else {
899 		i = strcmp(getsval(x), getsval(y));
900 	}
901 	tempfree(x);
902 	tempfree(y);
903 	switch (n) {
904 	case LT:	if (i<0) return(True);
905 			else return(False);
906 	case LE:	if (i<=0) return(True);
907 			else return(False);
908 	case NE:	if (x_is_nan && y_is_nan) return(True);
909 			else if (i!=0) return(True);
910 			else return(False);
911 	case EQ:	if (i == 0) return(True);
912 			else return(False);
913 	case GE:	if (i>=0) return(True);
914 			else return(False);
915 	case GT:	if (i>0) return(True);
916 			else return(False);
917 	default:	/* can't happen */
918 		FATAL("unknown relational operator %d", n);
919 	}
920 	return 0;	/*NOTREACHED*/
921 }
922 
923 void tfree(Cell *a)	/* free a tempcell */
924 {
925 	if (freeable(a)) {
926 		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
927 		xfree(a->sval);
928 	}
929 	if (a == tmps)
930 		FATAL("tempcell list is curdled");
931 	a->cnext = tmps;
932 	tmps = a;
933 }
934 
935 Cell *gettemp(void)	/* get a tempcell */
936 {	int i;
937 	Cell *x;
938 
939 	if (!tmps) {
940 		tmps = (Cell *) calloc(100, sizeof(*tmps));
941 		if (!tmps)
942 			FATAL("out of space for temporaries");
943 		for (i = 1; i < 100; i++)
944 			tmps[i-1].cnext = &tmps[i];
945 		tmps[i-1].cnext = NULL;
946 	}
947 	x = tmps;
948 	tmps = x->cnext;
949 	*x = tempcell;
950 	return(x);
951 }
952 
953 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
954 {
955 	Awkfloat val;
956 	Cell *x;
957 	int m;
958 	char *s;
959 
960 	x = execute(a[0]);
961 	val = getfval(x);	/* freebsd: defend against super large field numbers */
962 	if ((Awkfloat)INT_MAX < val)
963 		FATAL("trying to access out of range field %s", x->nval);
964 	m = (int) val;
965 	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
966 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
967 		/* BUG: can x->nval ever be null??? */
968 	tempfree(x);
969 	x = fieldadr(m);
970 	x->ctype = OCELL;	/* BUG?  why are these needed? */
971 	x->csub = CFLD;
972 	return(x);
973 }
974 
/*
 * substr: substr(a[0], a[1], a[2]).
 *
 * a[0] is the string, a[1] the 1-based start position in characters and
 * a[2] the optional length in characters.  Start and length are clamped to
 * the string, so the result is always a (possibly empty) piece of it.
 * Returns a temp cell holding the substring.
 */
Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
{
	int k, m, n;
	int mb, nb;
	char *s;
	int temp;
	Cell *x, *y, *z = NULL;

	x = execute(a[0]);
	y = execute(a[1]);
	if (a[2] != NULL)
		z = execute(a[2]);
	s = getsval(x);
	k = u8_strlen(s) + 1;	/* one past the last character position */
	if (k <= 1) {	/* empty source string: the result is empty too */
		tempfree(x);
		tempfree(y);
		if (a[2] != NULL) {
			tempfree(z);
		}
		x = gettemp();
		setsval(x, "");
		return(x);
	}
	m = (int) getfval(y);
	if (m <= 0)	/* clamp the start into 1..k */
		m = 1;
	else if (m > k)
		m = k;
	tempfree(y);
	if (a[2] != NULL) {
		n = (int) getfval(z);
		tempfree(z);
	} else
		n = k - 1;	/* no length given: ask for the whole string */
	if (n < 0)	/* clamp the length to what is left after the start */
		n = 0;
	else if (n > k - m)
		n = k - m;
	/* m is start, n is length from there */
	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
	y = gettemp();
	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */

	/* cut the source in place just long enough to copy the piece out,
	 * then restore the overwritten byte */
	temp = s[nb];	/* with thanks to John Linderman */
	s[nb] = '\0';
	setsval(y, s + mb);
	s[nb] = temp;
	tempfree(x);
	return(y);
}
1027 
1028 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1029 {
1030 	Cell *x, *y, *z;
1031 	char *s1, *s2, *p1, *p2, *q;
1032 	Awkfloat v = 0.0;
1033 
1034 	x = execute(a[0]);
1035 	s1 = getsval(x);
1036 	y = execute(a[1]);
1037 	s2 = getsval(y);
1038 
1039 	z = gettemp();
1040 	for (p1 = s1; *p1 != '\0'; p1++) {
1041 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1042 			continue;
1043 		if (*p2 == '\0') {
1044 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1045 
1046 		   /* should be a function: used in match() as well */
1047 			int i, len;
1048 			v = 0;
1049 			for (i = 0; i < p1-s1+1; i += len) {
1050 				len = u8_nextlen(s1+i);
1051 				v++;
1052 			}
1053 			break;
1054 		}
1055 	}
1056 	tempfree(x);
1057 	tempfree(y);
1058 	setfval(z, v);
1059 	return(z);
1060 }
1061 
/*
 * has_utf8: return 1 if s contains any utf-8 (2 bytes or more) character,
 * 0 otherwise.
 */
int has_utf8(char *s)
{
	while (*s != 0) {
		int seqlen = u8_nextlen(s);

		if (seqlen > 1)
			return 1;
		s += seqlen;
	}
	return 0;
}
1073 
1074 #define	MAXNUMSIZE	50
1075 
1076 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1077 {
1078 	char *fmt;
1079 	char *p, *t;
1080 	const char *os;
1081 	Cell *x;
1082 	int flag = 0, n;
1083 	int fmtwd; /* format width */
1084 	int fmtsz = recsize;
1085 	char *buf = *pbuf;
1086 	int bufsize = *pbufsize;
1087 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1088 #define BUFSZ(a)   (bufsize - ((a) - buf))
1089 
1090 	static bool first = true;
1091 	static bool have_a_format = false;
1092 
1093 	if (first) {
1094 		char xbuf[100];
1095 
1096 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1097 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1098 		first = false;
1099 	}
1100 
1101 	os = s;
1102 	p = buf;
1103 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1104 		FATAL("out of memory in format()");
1105 	while (*s) {
1106 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1107 		if (*s != '%') {
1108 			*p++ = *s++;
1109 			continue;
1110 		}
1111 		if (*(s+1) == '%') {
1112 			*p++ = '%';
1113 			s += 2;
1114 			continue;
1115 		}
1116 		fmtwd = atoi(s+1);
1117 		if (fmtwd < 0)
1118 			fmtwd = -fmtwd;
1119 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1120 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1121 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1122 				FATAL("format item %.30s... ran format() out of memory", os);
1123 			/* Ignore size specifiers */
1124 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1125 				t--;
1126 				continue;
1127 			}
1128 			if (isalpha((uschar)*s))
1129 				break;
1130 			if (*s == '$') {
1131 				FATAL("'$' not permitted in awk formats");
1132 			}
1133 			if (*s == '*') {
1134 				if (a == NULL) {
1135 					FATAL("not enough args in printf(%s)", os);
1136 				}
1137 				x = execute(a);
1138 				a = a->nnext;
1139 				snprintf(t - 1, FMTSZ(t - 1),
1140 				    "%d", fmtwd=(int) getfval(x));
1141 				if (fmtwd < 0)
1142 					fmtwd = -fmtwd;
1143 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1144 				t = fmt + strlen(fmt);
1145 				tempfree(x);
1146 			}
1147 		}
1148 		*t = '\0';
1149 		if (fmtwd < 0)
1150 			fmtwd = -fmtwd;
1151 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1152 		switch (*s) {
1153 		case 'a': case 'A':
1154 			if (have_a_format)
1155 				flag = *s;
1156 			else
1157 				flag = 'f';
1158 			break;
1159 		case 'f': case 'e': case 'g': case 'E': case 'G':
1160 			flag = 'f';
1161 			break;
1162 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1163 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1164 			*(t-1) = 'j';
1165 			*t = *s;
1166 			*++t = '\0';
1167 			break;
1168 		case 's':
1169 			flag = 's';
1170 			break;
1171 		case 'c':
1172 			flag = 'c';
1173 			break;
1174 		case '\0':
1175 			FATAL("missing printf conversion specifier");
1176 			break;
1177 		default:
1178 			WARNING("weird printf conversion %s", fmt);
1179 			flag = '?';
1180 			break;
1181 		}
1182 		if (a == NULL)
1183 			FATAL("not enough args in printf(%s)", os);
1184 		x = execute(a);
1185 		a = a->nnext;
1186 		n = MAXNUMSIZE;
1187 		if (fmtwd > n)
1188 			n = fmtwd;
1189 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1190 		switch (flag) {
1191 		case '?':
1192 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1193 			t = getsval(x);
1194 			n = strlen(t);
1195 			if (fmtwd > n)
1196 				n = fmtwd;
1197 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1198 			p += strlen(p);
1199 			snprintf(p, BUFSZ(p), "%s", t);
1200 			break;
1201 		case 'a':
1202 		case 'A':
1203 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1204 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1205 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1206 
1207 		case 's': {
1208 			t = getsval(x);
1209 			n = strlen(t);
1210 			/* if simple format or no utf-8 in the string, sprintf works */
1211 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1212 				if (fmtwd > n)
1213 					n = fmtwd;
1214 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1215 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1216 						" ran format() out of memory", n, t);
1217 				snprintf(p, BUFSZ(p), fmt, t);
1218 				break;
1219 			}
1220 
1221 			/* get here if string has utf-8 chars and fmt is not plain %s */
1222 			/* "%-w.ps", where -, w and .p are all optional */
1223 			/* '0' before the w is a flag character */
1224 			/* fmt points at % */
1225 			int ljust = 0, wid = 0, prec = n, pad = 0;
1226 			char *f = fmt+1;
1227 			if (f[0] == '-') {
1228 				ljust = 1;
1229 				f++;
1230 			}
1231 			// flags '0' and '+' are recognized but skipped
1232 			if (f[0] == '0') {
1233 				f++;
1234 				if (f[0] == '+')
1235 					f++;
1236 			}
1237 			if (f[0] == '+') {
1238 				f++;
1239 				if (f[0] == '0')
1240 					f++;
1241 			}
1242 			if (isdigit(f[0])) { /* there is a wid */
1243 				wid = strtol(f, &f, 10);
1244 			}
1245 			if (f[0] == '.') { /* there is a .prec */
1246 				prec = strtol(++f, &f, 10);
1247 			}
1248 			if (prec > u8_strlen(t))
1249 				prec = u8_strlen(t);
1250 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1251 			int i, k, n;
1252 
1253 			if (ljust) { // print prec chars from t, then pad blanks
1254 				n = u8_char2byte(t, prec);
1255 				for (k = 0; k < n; k++) {
1256 					//putchar(t[k]);
1257 					*p++ = t[k];
1258 				}
1259 				for (i = 0; i < pad; i++) {
1260 					//printf(" ");
1261 					*p++ = ' ';
1262 				}
1263 			} else { // print pad blanks, then prec chars from t
1264 				for (i = 0; i < pad; i++) {
1265 					//printf(" ");
1266 					*p++ = ' ';
1267 				}
1268 				n = u8_char2byte(t, prec);
1269 				for (k = 0; k < n; k++) {
1270 					//putchar(t[k]);
1271 					*p++ = t[k];
1272 				}
1273 			}
1274 			*p = 0;
1275 			break;
1276 		}
1277 
1278                case 'c': {
1279 			/*
1280 			 * If a numeric value is given, awk should just turn
1281 			 * it into a character and print it:
1282 			 *      BEGIN { printf("%c\n", 65) }
1283 			 * prints "A".
1284 			 *
1285 			 * But what if the numeric value is > 128 and
1286 			 * represents a valid Unicode code point?!? We do
1287 			 * our best to convert it back into UTF-8. If we
1288 			 * can't, we output the encoding of the Unicode
1289 			 * "invalid character", 0xFFFD.
1290 			 */
1291 			if (isnum(x)) {
1292 				int charval = (int) getfval(x);
1293 
1294 				if (charval != 0) {
1295 					if (charval < 128 || awk_mb_cur_max == 1)
1296 						snprintf(p, BUFSZ(p), fmt, charval);
1297 					else {
1298 						// possible unicode character
1299 						size_t count;
1300 						char *bs = wide_char_to_byte_str(charval, &count);
1301 
1302 						if (bs == NULL)	{ // invalid character
1303 							// use unicode invalid character, 0xFFFD
1304 							bs = "\357\277\275";
1305 							count = 3;
1306 						}
1307 						t = bs;
1308 						n = count;
1309 						goto format_percent_c;
1310 					}
1311 				} else {
1312 					*p++ = '\0'; /* explicit null byte */
1313 					*p = '\0';   /* next output will start here */
1314 				}
1315 				break;
1316 			}
1317 			t = getsval(x);
1318 			n = u8_nextlen(t);
1319 		format_percent_c:
1320 			if (n < 2) { /* not utf8 */
1321 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1322 				break;
1323 			}
1324 
1325 			// utf8 character, almost same song and dance as for %s
1326 			int ljust = 0, wid = 0, prec = n, pad = 0;
1327 			char *f = fmt+1;
1328 			if (f[0] == '-') {
1329 				ljust = 1;
1330 				f++;
1331 			}
1332 			// flags '0' and '+' are recognized but skipped
1333 			if (f[0] == '0') {
1334 				f++;
1335 				if (f[0] == '+')
1336 					f++;
1337 			}
1338 			if (f[0] == '+') {
1339 				f++;
1340 				if (f[0] == '0')
1341 					f++;
1342 			}
1343 			if (isdigit(f[0])) { /* there is a wid */
1344 				wid = strtol(f, &f, 10);
1345 			}
1346 			if (f[0] == '.') { /* there is a .prec */
1347 				prec = strtol(++f, &f, 10);
1348 			}
1349 			if (prec > 1)           // %c --> only one character
1350 				prec = 1;
1351 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1352 			int i;
1353 
1354 			if (ljust) { // print one char from t, then pad blanks
1355 				for (i = 0; i < n; i++)
1356 					*p++ = t[i];
1357 				for (i = 0; i < pad; i++) {
1358 					//printf(" ");
1359 					*p++ = ' ';
1360 				}
1361 			} else { // print pad blanks, then prec chars from t
1362 				for (i = 0; i < pad; i++) {
1363 					//printf(" ");
1364 					*p++ = ' ';
1365 				}
1366 				for (i = 0; i < n; i++)
1367 					*p++ = t[i];
1368 			}
1369 			*p = 0;
1370 			break;
1371 		}
1372 		default:
1373 			FATAL("can't happen: bad conversion %c in format()", flag);
1374 		}
1375 
1376 		tempfree(x);
1377 		p += strlen(p);
1378 		s++;
1379 	}
1380 	*p = '\0';
1381 	free(fmt);
1382 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1383 		x = execute(a);
1384 		tempfree(x);
1385 	}
1386 	*pbuf = buf;
1387 	*pbufsize = bufsize;
1388 	return p - buf;
1389 }
1390 
1391 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1392 {
1393 	Cell *x;
1394 	Node *y;
1395 	char *buf;
1396 	int bufsz=3*recsize;
1397 
1398 	if ((buf = (char *) malloc(bufsz)) == NULL)
1399 		FATAL("out of memory in awksprintf");
1400 	y = a[0]->nnext;
1401 	x = execute(a[0]);
1402 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1403 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1404 	tempfree(x);
1405 	x = gettemp();
1406 	x->sval = buf;
1407 	x->tval = STR;
1408 	return(x);
1409 }
1410 
1411 Cell *awkprintf(Node **a, int n)		/* printf */
1412 {	/* a[0] is list of args, starting with format string */
1413 	/* a[1] is redirection operator, a[2] is redirection file */
1414 	FILE *fp;
1415 	Cell *x;
1416 	Node *y;
1417 	char *buf;
1418 	int len;
1419 	int bufsz=3*recsize;
1420 
1421 	if ((buf = (char *) malloc(bufsz)) == NULL)
1422 		FATAL("out of memory in awkprintf");
1423 	y = a[0]->nnext;
1424 	x = execute(a[0]);
1425 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1426 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1427 	tempfree(x);
1428 	if (a[1] == NULL) {
1429 		/* fputs(buf, stdout); */
1430 		fwrite(buf, len, 1, stdout);
1431 		if (ferror(stdout))
1432 			FATAL("write error on stdout");
1433 	} else {
1434 		fp = redirect(ptoi(a[1]), a[2]);
1435 		/* fputs(buf, fp); */
1436 		fwrite(buf, len, 1, fp);
1437 		fflush(fp);
1438 		if (ferror(fp))
1439 			FATAL("write error on %s", filename(fp));
1440 	}
1441 	free(buf);
1442 	return(True);
1443 }
1444 
1445 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1446 {
1447 	Awkfloat i, j = 0;
1448 	double v;
1449 	Cell *x, *y, *z;
1450 
1451 	x = execute(a[0]);
1452 	i = getfval(x);
1453 	tempfree(x);
1454 	if (n != UMINUS && n != UPLUS) {
1455 		y = execute(a[1]);
1456 		j = getfval(y);
1457 		tempfree(y);
1458 	}
1459 	z = gettemp();
1460 	switch (n) {
1461 	case ADD:
1462 		i += j;
1463 		break;
1464 	case MINUS:
1465 		i -= j;
1466 		break;
1467 	case MULT:
1468 		i *= j;
1469 		break;
1470 	case DIVIDE:
1471 		if (j == 0)
1472 			FATAL("division by zero");
1473 		i /= j;
1474 		break;
1475 	case MOD:
1476 		if (j == 0)
1477 			FATAL("division by zero in mod");
1478 		modf(i/j, &v);
1479 		i = i - j * v;
1480 		break;
1481 	case UMINUS:
1482 		i = -i;
1483 		break;
1484 	case UPLUS: /* handled by getfval(), above */
1485 		break;
1486 	case POWER:
1487 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1488 			i = ipow(i, (int) j);
1489                else {
1490 			errno = 0;
1491 			i = errcheck(pow(i, j), "pow");
1492                }
1493 		break;
1494 	default:	/* can't happen */
1495 		FATAL("illegal arithmetic operator %d", n);
1496 	}
1497 	setfval(z, i);
1498 	return(z);
1499 }
1500 
/*
 * ipow: x raised to the integer power n by binary exponentiation.
 * Returns 1 for n <= 0.
 * The exponent's bits are consumed from the most significant one down;
 * each step squares the running value and, for a set bit, also
 * multiplies by x, using the same multiplication order as the
 * recursive halving formulation.
 */
double ipow(double x, int n)
{
	unsigned int bits, mask;
	double acc = 1;

	if (n <= 0)
		return 1;

	/* locate the highest set bit of n */
	bits = (unsigned int) n;
	mask = 1;
	while ((bits >> 1) >= mask)
		mask <<= 1;

	for (; mask != 0; mask >>= 1) {
		if (bits & mask)
			acc = x * acc * acc;
		else
			acc = acc * acc;
	}
	return acc;
}
1513 
1514 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1515 {
1516 	Cell *x, *z;
1517 	int k;
1518 	Awkfloat xf;
1519 
1520 	x = execute(a[0]);
1521 	xf = getfval(x);
1522 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1523 	if (n == PREINCR || n == PREDECR) {
1524 		setfval(x, xf + k);
1525 		return(x);
1526 	}
1527 	z = gettemp();
1528 	setfval(z, xf);
1529 	setfval(x, xf + k);
1530 	tempfree(x);
1531 	return(z);
1532 }
1533 
/*
 * assign: implement plain assignment (n == ASSIGN) and the compound
 * assignment operators (ADDEQ, SUBEQ, MULTEQ, DIVEQ, MODEQ, POWEQ).
 * a[0] is the target, a[1] the value expression.  Returns the target
 * cell.  Division or modulus by zero is fatal.
 */
1534 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1535 {		/* this is subtle; don't muck with it. */
1536 	Cell *x, *y;
1537 	Awkfloat xf, yf;
1538 	double v;
1539 
	/* note the order: the right-hand side is evaluated before the target */
1540 	y = execute(a[1]);
1541 	x = execute(a[0]);
1542 	if (n == ASSIGN) {	/* ordinary assignment */
1543 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1544 			;	/* self-assignment: leave alone unless it's a field or NF */
1545 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
			/* source carries both a string and a number: copy both */
1546 			setsval(x, getsval(y));
1547 			x->fval = getfval(y);
1548 			x->tval |= NUM;
1549 		}
1550 		else if (isstr(y))
1551 			setsval(x, getsval(y));
1552 		else if (isnum(y))
1553 			setfval(x, getfval(y));
1554 		else
1555 			funnyvar(y, "read value of");
1556 		tempfree(y);
1557 		return(x);
1558 	}
	/* compound assignment: combine both numeric values, store in x */
1559 	xf = getfval(x);
1560 	yf = getfval(y);
1561 	switch (n) {
1562 	case ADDEQ:
1563 		xf += yf;
1564 		break;
1565 	case SUBEQ:
1566 		xf -= yf;
1567 		break;
1568 	case MULTEQ:
1569 		xf *= yf;
1570 		break;
1571 	case DIVEQ:
1572 		if (yf == 0)
1573 			FATAL("division by zero in /=");
1574 		xf /= yf;
1575 		break;
1576 	case MODEQ:
1577 		if (yf == 0)
1578 			FATAL("division by zero in %%=");
		/* remainder = xf - yf * (integral part of xf/yf) */
1579 		modf(xf/yf, &v);
1580 		xf = xf - yf * v;
1581 		break;
1582 	case POWEQ:
1583 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1584 			xf = ipow(xf, (int) yf);
1585                else {
			/* clear errno so errcheck() sees only what pow() sets */
1586 			errno = 0;
1587 			xf = errcheck(pow(xf, yf), "pow");
1588                }
1589 		break;
1590 	default:
1591 		FATAL("illegal assignment operator %d", n);
1592 		break;
1593 	}
1594 	tempfree(y);
1595 	setfval(x, xf);
1596 	return(x);
1597 }
1598 
1599 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1600 {
1601 	Cell *x, *y, *z;
1602 	int n1, n2;
1603 	char *s = NULL;
1604 	int ssz = 0;
1605 
1606 	x = execute(a[0]);
1607 	n1 = strlen(getsval(x));
1608 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1609 	memcpy(s, x->sval, n1);
1610 
1611 	tempfree(x);
1612 
1613 	y = execute(a[1]);
1614 	n2 = strlen(getsval(y));
1615 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1616 	memcpy(s + n1, y->sval, n2);
1617 	s[n1 + n2] = '\0';
1618 
1619 	tempfree(y);
1620 
1621 	z = gettemp();
1622 	z->sval = s;
1623 	z->tval = STR;
1624 
1625 	return(z);
1626 }
1627 
1628 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1629 {
1630 	Cell *x;
1631 
1632 	if (a[0] == NULL)
1633 		x = execute(a[1]);
1634 	else {
1635 		x = execute(a[0]);
1636 		if (istrue(x)) {
1637 			tempfree(x);
1638 			x = execute(a[1]);
1639 		}
1640 	}
1641 	return x;
1642 }
1643 
1644 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1645 {
1646 	Cell *x;
1647 	int pair;
1648 
1649 	pair = ptoi(a[3]);
1650 	if (pairstack[pair] == 0) {
1651 		x = execute(a[0]);
1652 		if (istrue(x))
1653 			pairstack[pair] = 1;
1654 		tempfree(x);
1655 	}
1656 	if (pairstack[pair] == 1) {
1657 		x = execute(a[1]);
1658 		if (istrue(x))
1659 			pairstack[pair] = 0;
1660 		tempfree(x);
1661 		x = execute(a[2]);
1662 		return(x);
1663 	}
1664 	return(False);
1665 }
1666 
1667 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1668 {
1669 	Cell *x = NULL, *y, *ap;
1670 	const char *s, *origs, *t;
1671 	const char *fs = NULL;
1672 	char *origfs = NULL;
1673 	int sep;
1674 	char temp, num[50];
1675 	int n, tempstat, arg3type;
1676 	int j;
1677 	double result;
1678 
1679 	y = execute(a[0]);	/* source string */
1680 	origs = s = strdup(getsval(y));
1681 	tempfree(y);
1682 	arg3type = ptoi(a[3]);
1683 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1684 		fs = getsval(fsloc);
1685 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1686 		x = execute(a[2]);
1687 		fs = origfs = strdup(getsval(x));
1688 		tempfree(x);
1689 	} else if (arg3type == REGEXPR) {
1690 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1691 	} else {
1692 		FATAL("illegal type of split");
1693 	}
1694 	sep = *fs;
1695 	ap = execute(a[1]);	/* array name */
1696 /* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1697 	freesymtab(ap);
1698 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1699 	ap->tval &= ~STR;
1700 	ap->tval |= ARR;
1701 	ap->sval = (char *) makesymtab(NSYMTAB);
1702 
1703 	n = 0;
1704         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1705 		/* split(s, a, //); have to arrange that it looks like empty sep */
1706 		arg3type = 0;
1707 		fs = "";
1708 		sep = 0;
1709 	}
1710 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1711 		fa *pfa;
1712 		if (arg3type == REGEXPR) {	/* it's ready already */
1713 			pfa = (fa *) a[2];
1714 		} else {
1715 			pfa = makedfa(fs, 1);
1716 		}
1717 		if (nematch(pfa,s)) {
1718 			tempstat = pfa->initstat;
1719 			pfa->initstat = 2;
1720 			do {
1721 				n++;
1722 				snprintf(num, sizeof(num), "%d", n);
1723 				temp = *patbeg;
1724 				setptr(patbeg, '\0');
1725 				if (is_number(s, & result))
1726 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1727 				else
1728 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1729 				setptr(patbeg, temp);
1730 				s = patbeg + patlen;
1731 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1732 					n++;
1733 					snprintf(num, sizeof(num), "%d", n);
1734 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1735 					pfa->initstat = tempstat;
1736 					goto spdone;
1737 				}
1738 			} while (nematch(pfa,s));
1739 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1740 							/* cf gsub and refldbld */
1741 		}
1742 		n++;
1743 		snprintf(num, sizeof(num), "%d", n);
1744 		if (is_number(s, & result))
1745 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1746 		else
1747 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1748   spdone:
1749 		pfa = NULL;
1750 
1751 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1752 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1753 		for (;;) {
1754 			char *fr = newt;
1755 			n++;
1756 			if (*s == '"' ) { /* start of "..." */
1757 				for (s++ ; *s != '\0'; ) {
1758 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1759 						s += 2; /* doubled quote */
1760 						*fr++ = '"';
1761 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1762 						s++; /* skip over closing quote */
1763 						break;
1764 					} else {
1765 						*fr++ = *s++;
1766 					}
1767 				}
1768 				*fr++ = 0;
1769 			} else {	/* unquoted field */
1770 				while (*s != ',' && *s != '\0')
1771 					*fr++ = *s++;
1772 				*fr++ = 0;
1773 			}
1774 			snprintf(num, sizeof(num), "%d", n);
1775 			if (is_number(newt, &result))
1776 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1777 			else
1778 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1779 			if (*s++ == '\0')
1780 				break;
1781 		}
1782 		free(newt);
1783 
1784 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1785 		for (n = 0; ; ) {
1786 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1787 			while (ISWS(*s))
1788 				s++;
1789 			if (*s == '\0')
1790 				break;
1791 			n++;
1792 			t = s;
1793 			do
1794 				s++;
1795 			while (*s != '\0' && !ISWS(*s));
1796 			temp = *s;
1797 			setptr(s, '\0');
1798 			snprintf(num, sizeof(num), "%d", n);
1799 			if (is_number(t, & result))
1800 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1801 			else
1802 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1803 			setptr(s, temp);
1804 			if (*s != '\0')
1805 				s++;
1806 		}
1807 
1808 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1809 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1810 			char buf[10];
1811 			n++;
1812 			snprintf(num, sizeof(num), "%d", n);
1813 
1814 			for (j = 0; j < u8_nextlen(s); j++) {
1815 				buf[j] = s[j];
1816 			}
1817 			buf[j] = '\0';
1818 
1819 			if (isdigit((uschar)buf[0]))
1820 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1821 			else
1822 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1823 		}
1824 
1825 	} else if (*s != '\0') {  /* some random single character */
1826 		for (;;) {
1827 			n++;
1828 			t = s;
1829 			while (*s != sep && *s != '\n' && *s != '\0')
1830 				s++;
1831 			temp = *s;
1832 			setptr(s, '\0');
1833 			snprintf(num, sizeof(num), "%d", n);
1834 			if (is_number(t, & result))
1835 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1836 			else
1837 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1838 			setptr(s, temp);
1839 			if (*s++ == '\0')
1840 				break;
1841 		}
1842 	}
1843 	tempfree(ap);
1844 	xfree(origs);
1845 	xfree(origfs);
1846 	x = gettemp();
1847 	x->tval = NUM;
1848 	x->fval = n;
1849 	return(x);
1850 }
1851 
1852 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1853 {
1854 	Cell *x;
1855 
1856 	x = execute(a[0]);
1857 	if (istrue(x)) {
1858 		tempfree(x);
1859 		x = execute(a[1]);
1860 	} else {
1861 		tempfree(x);
1862 		x = execute(a[2]);
1863 	}
1864 	return(x);
1865 }
1866 
1867 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1868 {
1869 	Cell *x;
1870 
1871 	x = execute(a[0]);
1872 	if (istrue(x)) {
1873 		tempfree(x);
1874 		x = execute(a[1]);
1875 	} else if (a[2] != NULL) {
1876 		tempfree(x);
1877 		x = execute(a[2]);
1878 	}
1879 	return(x);
1880 }
1881 
1882 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1883 {
1884 	Cell *x;
1885 
1886 	for (;;) {
1887 		x = execute(a[0]);
1888 		if (!istrue(x))
1889 			return(x);
1890 		tempfree(x);
1891 		x = execute(a[1]);
1892 		if (isbreak(x)) {
1893 			x = True;
1894 			return(x);
1895 		}
1896 		if (isnext(x) || isexit(x) || isret(x))
1897 			return(x);
1898 		tempfree(x);
1899 	}
1900 }
1901 
1902 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1903 {
1904 	Cell *x;
1905 
1906 	for (;;) {
1907 		x = execute(a[0]);
1908 		if (isbreak(x))
1909 			return True;
1910 		if (isnext(x) || isexit(x) || isret(x))
1911 			return(x);
1912 		tempfree(x);
1913 		x = execute(a[1]);
1914 		if (!istrue(x))
1915 			return(x);
1916 		tempfree(x);
1917 	}
1918 }
1919 
1920 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1921 {
1922 	Cell *x;
1923 
1924 	x = execute(a[0]);
1925 	tempfree(x);
1926 	for (;;) {
1927 		if (a[1]!=NULL) {
1928 			x = execute(a[1]);
1929 			if (!istrue(x)) return(x);
1930 			else tempfree(x);
1931 		}
1932 		x = execute(a[3]);
1933 		if (isbreak(x))		/* turn off break */
1934 			return True;
1935 		if (isnext(x) || isexit(x) || isret(x))
1936 			return(x);
1937 		tempfree(x);
1938 		x = execute(a[2]);
1939 		tempfree(x);
1940 	}
1941 }
1942 
1943 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1944 {
1945 	Cell *x, *vp, *arrayp, *cp, *ncp;
1946 	Array *tp;
1947 	int i;
1948 
1949 	vp = execute(a[0]);
1950 	arrayp = execute(a[1]);
1951 	if (!isarr(arrayp)) {
1952 		return True;
1953 	}
1954 	tp = (Array *) arrayp->sval;
1955 	tempfree(arrayp);
1956 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1957 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1958 			setsval(vp, cp->nval);
1959 			ncp = cp->cnext;
1960 			x = execute(a[2]);
1961 			if (isbreak(x)) {
1962 				tempfree(vp);
1963 				return True;
1964 			}
1965 			if (isnext(x) || isexit(x) || isret(x)) {
1966 				tempfree(vp);
1967 				return(x);
1968 			}
1969 			tempfree(x);
1970 		}
1971 	}
1972 	return True;
1973 }
1974 
1975 static char *nawk_convert(const char *s, int (*fun_c)(int),
1976     wint_t (*fun_wc)(wint_t))
1977 {
1978 	char *buf      = NULL;
1979 	char *pbuf     = NULL;
1980 	const char *ps = NULL;
1981 	size_t n       = 0;
1982 	wchar_t wc;
1983 	const size_t sz = awk_mb_cur_max;
1984 	int unused;
1985 
1986 	if (sz == 1) {
1987 		buf = tostring(s);
1988 
1989 		for (pbuf = buf; *pbuf; pbuf++)
1990 			*pbuf = fun_c((uschar)*pbuf);
1991 
1992 		return buf;
1993 	} else {
1994 		/* upper/lower character may be shorter/longer */
1995 		buf = tostringN(s, strlen(s) * sz + 1);
1996 
1997 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
1998 		/*
1999 		 * Reset internal state here too.
2000 		 * Assign result to avoid a compiler warning. (Casting to void
2001 		 * doesn't work.)
2002 		 * Increment said variable to avoid a different warning.
2003 		 */
2004 		unused = wctomb(NULL, L'\0');
2005 		unused++;
2006 
2007 		ps   = s;
2008 		pbuf = buf;
2009 		while (n = mbtowc(&wc, ps, sz),
2010 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2011 		{
2012 			ps += n;
2013 
2014 			n = wctomb(pbuf, fun_wc(wc));
2015 			if (n == (size_t)-1)
2016 				FATAL("illegal wide character %s", s);
2017 
2018 			pbuf += n;
2019 		}
2020 
2021 		*pbuf = '\0';
2022 
2023 		if (n)
2024 			FATAL("illegal byte sequence %s", s);
2025 
2026 		return buf;
2027 	}
2028 }
2029 
#ifdef __DJGPP__
/*
 * Replacement wide-character case conversions, compiled only when
 * __DJGPP__ is defined.  Values in 0..255 are handed to the byte-oriented
 * toupper()/tolower(); every other value is returned unchanged.
 */
static wint_t towupper(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;

	return toupper(wc & 0xFF);
}

static wint_t towlower(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;

	return tolower(wc & 0xFF);
}
#endif
2047 
/* nawk_toupper: upper-cased copy of s, produced by nawk_convert(). */
static char *nawk_toupper(const char *s)
{
	char *upper = nawk_convert(s, toupper, towupper);

	return upper;
}
2052 
/* nawk_tolower: lower-cased copy of s, produced by nawk_convert(). */
static char *nawk_tolower(const char *s)
{
	char *lower = nawk_convert(s, tolower, towlower);

	return lower;
}
2057 
2058 
2059 
2060 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2061 {
2062 	Cell *x, *y;
2063 	Awkfloat u;
2064 	int t, sz;
2065 	Awkfloat tmp;
2066 	char *buf, *fmt;
2067 	Node *nextarg;
2068 	FILE *fp;
2069 	int status = 0;
2070 	time_t tv;
2071 	struct tm *tm;
2072 
2073 	t = ptoi(a[0]);
2074 	x = execute(a[1]);
2075 	nextarg = a[1]->nnext;
2076 	switch (t) {
2077 	case FLENGTH:
2078 		if (isarr(x))
2079 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2080 		else
2081 			u = u8_strlen(getsval(x));
2082 		break;
2083 	case FLOG:
2084 		errno = 0;
2085 		u = errcheck(log(getfval(x)), "log");
2086 		break;
2087 	case FINT:
2088 		modf(getfval(x), &u); break;
2089 	case FEXP:
2090 		errno = 0;
2091 		u = errcheck(exp(getfval(x)), "exp");
2092 		break;
2093 	case FSQRT:
2094 		errno = 0;
2095 		u = errcheck(sqrt(getfval(x)), "sqrt");
2096 		break;
2097 	case FSIN:
2098 		u = sin(getfval(x)); break;
2099 	case FCOS:
2100 		u = cos(getfval(x)); break;
2101 	case FATAN:
2102 		if (nextarg == NULL) {
2103 			WARNING("atan2 requires two arguments; returning 1.0");
2104 			u = 1.0;
2105 		} else {
2106 			y = execute(a[1]->nnext);
2107 			u = atan2(getfval(x), getfval(y));
2108 			tempfree(y);
2109 			nextarg = nextarg->nnext;
2110 		}
2111 		break;
2112 	case FCOMPL:
2113 		u = ~((int)getfval(x));
2114 		break;
2115 	case FAND:
2116 		if (nextarg == 0) {
2117 			WARNING("and requires two arguments; returning 0");
2118 			u = 0;
2119 			break;
2120 		}
2121 		y = execute(a[1]->nnext);
2122 		u = ((int)getfval(x)) & ((int)getfval(y));
2123 		tempfree(y);
2124 		nextarg = nextarg->nnext;
2125 		break;
2126 	case FFOR:
2127 		if (nextarg == 0) {
2128 			WARNING("or requires two arguments; returning 0");
2129 			u = 0;
2130 			break;
2131 		}
2132 		y = execute(a[1]->nnext);
2133 		u = ((int)getfval(x)) | ((int)getfval(y));
2134 		tempfree(y);
2135 		nextarg = nextarg->nnext;
2136 		break;
2137 	case FXOR:
2138 		if (nextarg == 0) {
2139 			WARNING("xor requires two arguments; returning 0");
2140 			u = 0;
2141 			break;
2142 		}
2143 		y = execute(a[1]->nnext);
2144 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2145 		tempfree(y);
2146 		nextarg = nextarg->nnext;
2147 		break;
2148 	case FLSHIFT:
2149 		if (nextarg == 0) {
2150 			WARNING("lshift requires two arguments; returning 0");
2151 			u = 0;
2152 			break;
2153 		}
2154 		y = execute(a[1]->nnext);
2155 		u = ((int)getfval(x)) << ((int)getfval(y));
2156 		tempfree(y);
2157 		nextarg = nextarg->nnext;
2158 		break;
2159 	case FRSHIFT:
2160 		if (nextarg == 0) {
2161 			WARNING("rshift requires two arguments; returning 0");
2162 			u = 0;
2163 			break;
2164 		}
2165 		y = execute(a[1]->nnext);
2166 		u = ((int)getfval(x)) >> ((int)getfval(y));
2167 		tempfree(y);
2168 		nextarg = nextarg->nnext;
2169 		break;
2170 	case FSYSTEM:
2171 		fflush(stdout);		/* in case something is buffered already */
2172 		status = system(getsval(x));
2173 		u = status;
2174 		if (status != -1) {
2175 			if (WIFEXITED(status)) {
2176 				u = WEXITSTATUS(status);
2177 			} else if (WIFSIGNALED(status)) {
2178 				u = WTERMSIG(status) + 256;
2179 #ifdef WCOREDUMP
2180 				if (WCOREDUMP(status))
2181 					u += 256;
2182 #endif
2183 			} else	/* something else?!? */
2184 				u = 0;
2185 		}
2186 		break;
2187 	case FRAND:
2188 		/* random() returns numbers in [0..2^31-1]
2189 		 * in order to get a number in [0, 1), divide it by 2^31
2190 		 */
2191 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2192 		break;
2193 	case FSRAND:
2194 		if (isrec(x))	/* no argument provided */
2195 			u = time((time_t *)0);
2196 		else
2197 			u = getfval(x);
2198 		tmp = u;
2199 		srandom((unsigned long) u);
2200 		u = srand_seed;
2201 		srand_seed = tmp;
2202 		break;
2203 	case FTOUPPER:
2204 	case FTOLOWER:
2205 		if (t == FTOUPPER)
2206 			buf = nawk_toupper(getsval(x));
2207 		else
2208 			buf = nawk_tolower(getsval(x));
2209 		tempfree(x);
2210 		x = gettemp();
2211 		setsval(x, buf);
2212 		free(buf);
2213 		return x;
2214 	case FFLUSH:
2215 		if (isrec(x) || strlen(getsval(x)) == 0) {
2216 			flush_all();	/* fflush() or fflush("") -> all */
2217 			u = 0;
2218 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2219 			u = EOF;
2220 		else
2221 			u = fflush(fp);
2222 		break;
2223 	case FSYSTIME:
2224 		u = time((time_t *) 0);
2225 		break;
2226 	case FSTRFTIME:
2227 		/* strftime([format [,timestamp]]) */
2228 		if (nextarg) {
2229 			y = execute(nextarg);
2230 			nextarg = nextarg->nnext;
2231 			tv = (time_t) getfval(y);
2232 			tempfree(y);
2233 		} else
2234 			tv = time((time_t *) 0);
2235 		tm = localtime(&tv);
2236 		if (tm == NULL)
2237 			FATAL("bad time %ld", (long)tv);
2238 
2239 		if (isrec(x)) {
2240 			/* format argument not provided, use default */
2241 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2242 		} else
2243 			fmt = tostring(getsval(x));
2244 
2245 		sz = 32;
2246 		buf = NULL;
2247 		do {
2248 			if ((buf = realloc(buf, (sz *= 2))) == NULL)
2249 				FATAL("out of memory in strftime");
2250 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2251 
2252 		y = gettemp();
2253 		setsval(y, buf);
2254 		free(fmt);
2255 		free(buf);
2256 
2257 		return y;
2258 	default:	/* can't happen */
2259 		FATAL("illegal function type %d", t);
2260 		break;
2261 	}
2262 	tempfree(x);
2263 	x = gettemp();
2264 	setfval(x, u);
2265 	if (nextarg != NULL) {
2266 		WARNING("warning: function has too many arguments");
2267 		for ( ; nextarg; nextarg = nextarg->nnext) {
2268 			y = execute(nextarg);
2269 			tempfree(y);
2270 		}
2271 	}
2272 	return(x);
2273 }
2274 
2275 Cell *printstat(Node **a, int n)	/* print a[0] */
2276 {
2277 	Node *x;
2278 	Cell *y;
2279 	FILE *fp;
2280 
2281 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2282 		fp = stdout;
2283 	else
2284 		fp = redirect(ptoi(a[1]), a[2]);
2285 	for (x = a[0]; x != NULL; x = x->nnext) {
2286 		y = execute(x);
2287 		fputs(getpssval(y), fp);
2288 		tempfree(y);
2289 		if (x->nnext == NULL)
2290 			fputs(getsval(orsloc), fp);
2291 		else
2292 			fputs(getsval(ofsloc), fp);
2293 	}
2294 	if (a[1] != NULL)
2295 		fflush(fp);
2296 	if (ferror(fp))
2297 		FATAL("write error on %s", filename(fp));
2298 	return(True);
2299 }
2300 
2301 Cell *nullproc(Node **a, int n)
2302 {
2303 	return 0;
2304 }
2305 
2306 
2307 FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
2308 {
2309 	FILE *fp;
2310 	Cell *x;
2311 	char *fname;
2312 
2313 	x = execute(b);
2314 	fname = getsval(x);
2315 	fp = openfile(a, fname, NULL);
2316 	if (fp == NULL)
2317 		FATAL("can't open file %s", fname);
2318 	tempfree(x);
2319 	return fp;
2320 }
2321 
/* One entry per stream awk has open for redirection. */
struct files {
	FILE		*fp;	/* the stream; NULL marks a free slot */
	const char	*fname;	/* name the stream was opened under */
	int		mode;	/* '|', 'a', 'w' => LE/LT, GT */
};

struct files *files;	/* table of entries */

size_t nfiles;		/* number of entries in the table */
2329 
2330 static void stdinit(void)	/* in case stdin, etc., are not constants */
2331 {
2332 	nfiles = FOPEN_MAX;
2333 	files = (struct files *) calloc(nfiles, sizeof(*files));
2334 	if (files == NULL)
2335 		FATAL("can't allocate file memory for %zu files", nfiles);
2336         files[0].fp = stdin;
2337 	files[0].fname = tostring("/dev/stdin");
2338 	files[0].mode = LT;
2339         files[1].fp = stdout;
2340 	files[1].fname = tostring("/dev/stdout");
2341 	files[1].mode = GT;
2342         files[2].fp = stderr;
2343 	files[2].fname = tostring("/dev/stderr");
2344 	files[2].mode = GT;
2345 }
2346 
2347 FILE *openfile(int a, const char *us, bool *pnewflag)
2348 {
2349 	const char *s = us;
2350 	size_t i;
2351 	int m;
2352 	FILE *fp = NULL;
2353 
2354 	if (*s == '\0')
2355 		FATAL("null file name in print or getline");
2356 	for (i = 0; i < nfiles; i++)
2357 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2358 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2359 		     a == FFLUSH)) {
2360 			if (pnewflag)
2361 				*pnewflag = false;
2362 			return files[i].fp;
2363 		}
2364 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2365 		return NULL;
2366 
2367 	for (i = 0; i < nfiles; i++)
2368 		if (files[i].fp == NULL)
2369 			break;
2370 	if (i >= nfiles) {
2371 		struct files *nf;
2372 		size_t nnf = nfiles + FOPEN_MAX;
2373 		nf = (struct files *) realloc(files, nnf * sizeof(*nf));
2374 		if (nf == NULL)
2375 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2376 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2377 		nfiles = nnf;
2378 		files = nf;
2379 	}
2380 	fflush(stdout);	/* force a semblance of order */
2381 	m = a;
2382 	if (a == GT) {
2383 		fp = fopen(s, "w");
2384 	} else if (a == APPEND) {
2385 		fp = fopen(s, "a");
2386 		m = GT;	/* so can mix > and >> */
2387 	} else if (a == '|') {	/* output pipe */
2388 		fp = popen(s, "w");
2389 	} else if (a == LE) {	/* input pipe */
2390 		fp = popen(s, "r");
2391 	} else if (a == LT) {	/* getline <file */
2392 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2393 	} else	/* can't happen */
2394 		FATAL("illegal redirection %d", a);
2395 	if (fp != NULL) {
2396 		files[i].fname = tostring(s);
2397 		files[i].fp = fp;
2398 		files[i].mode = m;
2399 		if (pnewflag)
2400 			*pnewflag = true;
2401 		if (fp != stdin && fp != stdout && fp != stderr)
2402 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2403 	}
2404 	return fp;
2405 }
2406 
2407 const char *filename(FILE *fp)
2408 {
2409 	size_t i;
2410 
2411 	for (i = 0; i < nfiles; i++)
2412 		if (fp == files[i].fp)
2413 			return files[i].fname;
2414 	return "???";
2415 }
2416 
2417 Cell *closefile(Node **a, int n)
2418 {
2419  	Cell *x;
2420 	size_t i;
2421 	bool stat;
2422 
2423  	x = execute(a[0]);
2424  	getsval(x);
2425 	stat = true;
2426  	for (i = 0; i < nfiles; i++) {
2427 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2428 			continue;
2429 		if (files[i].mode == GT || files[i].mode == '|')
2430 			fflush(files[i].fp);
2431 		if (ferror(files[i].fp)) {
2432 			if ((files[i].mode == GT && files[i].fp != stderr)
2433 			  || files[i].mode == '|')
2434 				FATAL("write error on %s", files[i].fname);
2435 			else
2436 				WARNING("i/o error occurred on %s", files[i].fname);
2437 		}
2438 		if (files[i].fp == stdin || files[i].fp == stdout ||
2439 		    files[i].fp == stderr)
2440 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2441 		else if (files[i].mode == '|' || files[i].mode == LE)
2442 			stat = pclose(files[i].fp) == -1;
2443 		else
2444 			stat = fclose(files[i].fp) == EOF;
2445 		if (stat)
2446 			WARNING("i/o error occurred closing %s", files[i].fname);
2447 		xfree(files[i].fname);
2448 		files[i].fname = NULL;	/* watch out for ref thru this */
2449 		files[i].fp = NULL;
2450 		break;
2451  	}
2452  	tempfree(x);
2453  	x = gettemp();
2454 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2455  	return(x);
2456 }
2457 
2458 void closeall(void)
2459 {
2460 	size_t i;
2461 	bool stat = false;
2462 
2463 	for (i = 0; i < nfiles; i++) {
2464 		if (! files[i].fp)
2465 			continue;
2466 		if (files[i].mode == GT || files[i].mode == '|')
2467 			fflush(files[i].fp);
2468 		if (ferror(files[i].fp)) {
2469 			if ((files[i].mode == GT && files[i].fp != stderr)
2470 			  || files[i].mode == '|')
2471 				FATAL("write error on %s", files[i].fname);
2472 			else
2473 				WARNING("i/o error occurred on %s", files[i].fname);
2474 		}
2475 		if (files[i].fp == stdin || files[i].fp == stdout ||
2476 		    files[i].fp == stderr)
2477 			continue;
2478 		if (files[i].mode == '|' || files[i].mode == LE)
2479 			stat = pclose(files[i].fp) == -1;
2480 		else
2481 			stat = fclose(files[i].fp) == EOF;
2482 		if (stat)
2483 			WARNING("i/o error occurred while closing %s", files[i].fname);
2484 	}
2485 }
2486 
2487 static void flush_all(void)
2488 {
2489 	size_t i;
2490 
2491 	for (i = 0; i < nfiles; i++)
2492 		if (files[i].fp)
2493 			fflush(files[i].fp);
2494 }
2495 
2496 void backsub(char **pb_ptr, const char **sptr_ptr);
2497 
2498 Cell *sub(Node **a, int nnn)	/* substitute command */
2499 {
2500 	const char *sptr, *q;
2501 	Cell *x, *y, *result;
2502 	char *t, *buf, *pb;
2503 	fa *pfa;
2504 	int bufsz = recsize;
2505 
2506 	if ((buf = (char *) malloc(bufsz)) == NULL)
2507 		FATAL("out of memory in sub");
2508 	x = execute(a[3]);	/* target string */
2509 	t = getsval(x);
2510 	if (a[0] == NULL)	/* 0 => a[1] is already-compiled regexpr */
2511 		pfa = (fa *) a[1];	/* regular expression */
2512 	else {
2513 		y = execute(a[1]);
2514 		pfa = makedfa(getsval(y), 1);
2515 		tempfree(y);
2516 	}
2517 	y = execute(a[2]);	/* replacement string */
2518 	result = False;
2519 	if (pmatch(pfa, t)) {
2520 		sptr = t;
2521 		adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
2522 		pb = buf;
2523 		while (sptr < patbeg)
2524 			*pb++ = *sptr++;
2525 		sptr = getsval(y);
2526 		while (*sptr != '\0') {
2527 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
2528 			if (*sptr == '\\') {
2529 				backsub(&pb, &sptr);
2530 			} else if (*sptr == '&') {
2531 				sptr++;
2532 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
2533 				for (q = patbeg; q < patbeg+patlen; )
2534 					*pb++ = *q++;
2535 			} else
2536 				*pb++ = *sptr++;
2537 		}
2538 		*pb = '\0';
2539 		if (pb > buf + bufsz)
2540 			FATAL("sub result1 %.30s too big; can't happen", buf);
2541 		sptr = patbeg + patlen;
2542 		if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
2543 			adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
2544 			while ((*pb++ = *sptr++) != '\0')
2545 				continue;
2546 		}
2547 		if (pb > buf + bufsz)
2548 			FATAL("sub result2 %.30s too big; can't happen", buf);
2549 		setsval(x, buf);	/* BUG: should be able to avoid copy */
2550 		result = True;
2551 	}
2552 	tempfree(x);
2553 	tempfree(y);
2554 	free(buf);
2555 	return result;
2556 }
2557 
2558 Cell *gsub(Node **a, int nnn)	/* global substitute */
2559 {
2560 	Cell *x, *y;
2561 	char *rptr, *pb;
2562 	const char *q, *t, *sptr;
2563 	char *buf;
2564 	fa *pfa;
2565 	int mflag, tempstat, num;
2566 	int bufsz = recsize;
2567 	int charlen = 0;
2568 
2569 	if ((buf = (char *) malloc(bufsz)) == NULL)
2570 		FATAL("out of memory in gsub");
2571 	mflag = 0;	/* if mflag == 0, can replace empty string */
2572 	num = 0;
2573 	x = execute(a[3]);	/* target string */
2574 	t = getsval(x);
2575 	if (a[0] == NULL)	/* 0 => a[1] is already-compiled regexpr */
2576 		pfa = (fa *) a[1];	/* regular expression */
2577 	else {
2578 		y = execute(a[1]);
2579 		pfa = makedfa(getsval(y), 1);
2580 		tempfree(y);
2581 	}
2582 	y = execute(a[2]);	/* replacement string */
2583 	if (pmatch(pfa, t)) {
2584 		tempstat = pfa->initstat;
2585 		pfa->initstat = 2;
2586 		pb = buf;
2587 		rptr = getsval(y);
2588 		do {
2589 			if (patlen == 0 && *patbeg != '\0') {	/* matched empty string */
2590 				if (mflag == 0) {	/* can replace empty */
2591 					num++;
2592 					sptr = rptr;
2593 					while (*sptr != '\0') {
2594 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
2595 						if (*sptr == '\\') {
2596 							backsub(&pb, &sptr);
2597 						} else if (*sptr == '&') {
2598 							sptr++;
2599 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
2600 							for (q = patbeg; q < patbeg+patlen; )
2601 								*pb++ = *q++;
2602 						} else
2603 							*pb++ = *sptr++;
2604 					}
2605 				}
2606 				if (*t == '\0')	/* at end */
2607 					goto done;
2608 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
2609 				charlen = u8_nextlen(t);
2610 				while (charlen-- > 0)
2611 					*pb++ = *t++;
2612 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2613 					FATAL("gsub result0 %.30s too big; can't happen", buf);
2614 				mflag = 0;
2615 			}
2616 			else {	/* matched nonempty string */
2617 				num++;
2618 				sptr = t;
2619 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
2620 				while (sptr < patbeg)
2621 					*pb++ = *sptr++;
2622 				sptr = rptr;
2623 				while (*sptr != '\0') {
2624 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
2625 					if (*sptr == '\\') {
2626 						backsub(&pb, &sptr);
2627 					} else if (*sptr == '&') {
2628 						sptr++;
2629 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
2630 						for (q = patbeg; q < patbeg+patlen; )
2631 							*pb++ = *q++;
2632 					} else
2633 						*pb++ = *sptr++;
2634 				}
2635 				t = patbeg + patlen;
2636 				if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
2637 					goto done;
2638 				if (pb > buf + bufsz)
2639 					FATAL("gsub result1 %.30s too big; can't happen", buf);
2640 				mflag = 1;
2641 			}
2642 		} while (pmatch(pfa,t));
2643 		sptr = t;
2644 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
2645 		while ((*pb++ = *sptr++) != '\0')
2646 			continue;
2647 	done:	if (pb < buf + bufsz)
2648 			*pb = '\0';
2649 		else if (*(pb-1) != '\0')
2650 			FATAL("gsub result2 %.30s truncated; can't happen", buf);
2651 		setsval(x, buf);	/* BUG: should be able to avoid copy + free */
2652 		pfa->initstat = tempstat;
2653 	}
2654 	tempfree(x);
2655 	tempfree(y);
2656 	x = gettemp();
2657 	x->tval = NUM;
2658 	x->fval = num;
2659 	free(buf);
2660 	return(x);
2661 }
2662 
2663 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2664 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2665 {
2666 	Cell *x, *y, *res, *h;
2667 	char *rptr;
2668 	const char *sptr;
2669 	char *buf, *pb;
2670 	const char *t, *q;
2671 	fa *pfa;
2672 	int mflag, tempstat, num, whichm;
2673 	int bufsz = recsize;
2674 
2675 	if ((buf = malloc(bufsz)) == NULL)
2676 		FATAL("out of memory in gensub");
2677 	mflag = 0;	/* if mflag == 0, can replace empty string */
2678 	num = 0;
2679 	x = execute(a[4]);	/* source string */
2680 	t = getsval(x);
2681 	res = copycell(x);	/* target string - initially copy of source */
2682 	res->csub = CTEMP;	/* result values are temporary */
2683 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2684 		pfa = (fa *) a[1];	/* regular expression */
2685 	else {
2686 		y = execute(a[1]);
2687 		pfa = makedfa(getsval(y), 1);
2688 		tempfree(y);
2689 	}
2690 	y = execute(a[2]);	/* replacement string */
2691 	h = execute(a[3]);	/* which matches should be replaced */
2692 	sptr = getsval(h);
2693 	if (sptr[0] == 'g' || sptr[0] == 'G')
2694 		whichm = -1;
2695 	else {
2696 		/*
2697 		 * The specified number is index of replacement, starting
2698 		 * from 1. GNU awk treats index lower than 0 same as
2699 		 * 1, we do same for compatibility.
2700 		 */
2701 		whichm = (int) getfval(h) - 1;
2702 		if (whichm < 0)
2703 			whichm = 0;
2704 	}
2705 	tempfree(h);
2706 
2707 	if (pmatch(pfa, t)) {
2708 		char *sl;
2709 
2710 		tempstat = pfa->initstat;
2711 		pfa->initstat = 2;
2712 		pb = buf;
2713 		rptr = getsval(y);
2714 		/*
2715 		 * XXX if there are any backreferences in subst string,
2716 		 * complain now.
2717 		 */
2718 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2719 			if (strchr("0123456789", sl[1])) {
2720 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2721 			}
2722 		}
2723 
2724 		do {
2725 			if (whichm >= 0 && whichm != num) {
2726 				num++;
2727 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2728 
2729 				/* copy the part of string up to and including
2730 				 * match to output buffer */
2731 				while (t < patbeg + patlen)
2732 					*pb++ = *t++;
2733 				continue;
2734 			}
2735 
2736 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2737 				if (mflag == 0) {	/* can replace empty */
2738 					num++;
2739 					sptr = rptr;
2740 					while (*sptr != 0) {
2741 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2742 						if (*sptr == '\\') {
2743 							backsub(&pb, &sptr);
2744 						} else if (*sptr == '&') {
2745 							sptr++;
2746 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2747 							for (q = patbeg; q < patbeg+patlen; )
2748 								*pb++ = *q++;
2749 						} else
2750 							*pb++ = *sptr++;
2751 					}
2752 				}
2753 				if (*t == 0)	/* at end */
2754 					goto done;
2755 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2756 				*pb++ = *t++;
2757 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2758 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2759 				mflag = 0;
2760 			}
2761 			else {	/* matched nonempty string */
2762 				num++;
2763 				sptr = t;
2764 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2765 				while (sptr < patbeg)
2766 					*pb++ = *sptr++;
2767 				sptr = rptr;
2768 				while (*sptr != 0) {
2769 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2770 					if (*sptr == '\\') {
2771 						backsub(&pb, &sptr);
2772 					} else if (*sptr == '&') {
2773 						sptr++;
2774 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2775 						for (q = patbeg; q < patbeg+patlen; )
2776 							*pb++ = *q++;
2777 					} else
2778 						*pb++ = *sptr++;
2779 				}
2780 				t = patbeg + patlen;
2781 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2782 					goto done;
2783 				if (pb > buf + bufsz)
2784 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2785 				mflag = 1;
2786 			}
2787 		} while (pmatch(pfa,t));
2788 		sptr = t;
2789 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2790 		while ((*pb++ = *sptr++) != 0)
2791 			;
2792 	done:	if (pb > buf + bufsz)
2793 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2794 		*pb = '\0';
2795 		setsval(res, buf);
2796 		pfa->initstat = tempstat;
2797 	}
2798 	tempfree(x);
2799 	tempfree(y);
2800 	free(buf);
2801 	return(res);
2802 }
2803 
/*
 * backsub - handle \\& variations in a sub/gsub replacement string.
 *
 * On entry (*sptr_ptr)[0] is a backslash.  The translated bytes are
 * appended at *pb_ptr and both pointers are advanced:
 *
 *	\\\&	emits \&
 *	\\&	emits \ and stops in front of the & (left for the caller)
 *	\\x	emits \ when POSIXLY_CORRECT is set, \\ otherwise
 *	\&	emits a literal &
 *	\x	emits \ and stops in front of x
 *
 * The environment is consulted once, on the first call.
 */
void backsub(char **pb_ptr, const char **sptr_ptr)
{
	static int posix_mode = -1;	/* -1 until POSIXLY_CORRECT has been looked up */
	const char *in = *sptr_ptr;
	char *out = *pb_ptr;

	if (posix_mode < 0)
		posix_mode = (getenv("POSIXLY_CORRECT") != NULL);

	switch (in[1]) {
	case '\\':
		if (in[2] == '\\' && in[3] == '&') {	/* \\\& -> \& */
			*out++ = '\\';
			*out++ = '&';
			in += 4;
		} else if (in[2] == '&') {	/* \\& -> \ + matched */
			*out++ = '\\';
			in += 2;
		} else if (posix_mode) {	/* \\x -> \x */
			*out++ = in[1];
			in += 2;
		} else {			/* \\x -> \\x */
			*out++ = in[0];
			*out++ = in[1];
			in += 2;
		}
		break;
	case '&':				/* literal & */
		*out++ = in[1];
		in += 2;
		break;
	default:				/* literal \ */
		*out++ = in[0];
		in += 1;
		break;
	}

	*sptr_ptr = in;
	*pb_ptr = out;
}
2840 
/*
 * wide_char_to_byte_str - encode a code point as a UTF-8 byte string.
 *
 * rune must lie in 0 .. 0x10FFFF; otherwise NULL is returned and *outlen
 * is left untouched.  On success the number of encoded bytes (1 to 4) is
 * stored in *outlen and a pointer to a NUL-terminated static buffer is
 * returned; the buffer is overwritten by the next call.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char buf[5];
	unsigned lead;		/* marker bits of the first byte */
	size_t nbytes, i;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	if (rune <= 0x0000007F) {
		nbytes = 1;	/* 0xxxxxxx */
		lead = 0x00;
	} else if (rune <= 0x000007FF) {
		nbytes = 2;	/* 110xxxxx 10xxxxxx */
		lead = 0xC0;
	} else if (rune <= 0x0000FFFF) {
		nbytes = 3;	/* 1110xxxx 10xxxxxx 10xxxxxx */
		lead = 0xE0;
	} else {
		nbytes = 4;	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		lead = 0xF0;
	}

	memset(buf, 0, sizeof(buf));

	/* continuation bytes carry six bits each, filled from the last one */
	for (i = nbytes - 1; i > 0; i--) {
		buf[i] = (char) (0x80 | (rune & 0x3F));
		rune >>= 6;
	}
	buf[0] = (char) (lead | (unsigned) rune);
	buf[nbytes] = '\0';

	*outlen = nbytes;
	return buf;
}
2878