xref: /freebsd/contrib/one-true-awk/run.c (revision b45a181a74c816cfc553e8210954916887fb94dc)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #define DEBUG
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <errno.h>
29 #include <wctype.h>
30 #include <fcntl.h>
31 #include <setjmp.h>
32 #include <limits.h>
33 #include <math.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <time.h>
37 #include <sys/types.h>
38 #include <sys/stat.h>
39 #include <sys/wait.h>
40 #include "awk.h"
41 #include "awkgram.tab.h"
42 
43 
/* Helpers private to this file; their definitions are not in this part of the file. */
static void stdinit(void);
static void flush_all(void);
static char *wide_char_to_byte_str(int rune, size_t *outlen);

/*
 * tempfree(x): hand cell x back to the temporary pool via tfree(), but
 * only when istemp(x) says it is a temporary.  The macro form is the one
 * compiled in; note that it evaluates its argument twice.
 */
#if 1
#define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
#else
/* Disabled function form: same behaviour plus a warning for an OCELL whose csub is outside CUNK..CFREE. */
void tempfree(Cell *p) {
	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
		WARNING("bad csub %d in Cell %d %s",
			p->csub, p->ctype, p->sval);
	}
	if (istemp(p))
		tfree(p);
}
#endif
60 
61 /* do we really need these? */
62 /* #ifdef _NFILE */
63 /* #ifndef FOPEN_MAX */
64 /* #define FOPEN_MAX _NFILE */
65 /* #endif */
66 /* #endif */
67 /*  */
68 /* #ifndef	FOPEN_MAX */
69 /* #define	FOPEN_MAX	40 */	/* max number of open files */
70 /* #endif */
71 /*  */
72 /* #ifndef RAND_MAX */
73 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
74 /* #endif */
75 
/* Jump buffer for `exit`: program() arms it with setjmp(), jump() fires it with longjmp(). */
jmp_buf env;
extern	int	pairstack[];
extern	Awkfloat	srand_seed;

Node	*winner = NULL;	/* root of parse tree */
Cell	*tmps;		/* free temporary cells for execution */
			/* (list head, linked through cnext by tfree()/gettemp()) */

/*
 * Shared, statically allocated result cells.  Code in this file returns
 * the pointers below instead of building a fresh cell each time:
 * boolop()/relop() return True or False, jump() returns the j* cells.
 */
static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
Cell	*True	= &truecell;
static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*False	= &falsecell;
static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jbreak	= &breakcell;
static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jcont	= &contcell;
static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnext	= &nextcell;
static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnextfile	= &nextfilecell;
static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jexit	= &exitcell;
static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jret	= &retcell;
/* Template that gettemp() copies over every temporary cell it hands out. */
static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };

/* Updated by execute() each time it moves on to a new node. */
Node	*curnode = NULL;	/* the node being executed, for debugging */
102 
103 /* buffer memory management */
/*
 * adjbuf - make sure a heap buffer can hold at least minlen bytes.
 *
 * pbuf:    address of pointer to buffer being managed
 * psiz:    address of buffer size variable
 * minlen:  minimum length of buffer needed
 * quantum: buffer size quantum (0 means "do not round")
 * pbptr:   address of movable pointer into buffer, or 0 if none
 * whatrtn: name of the calling routine if failure should cause fatal
 *          error, or NULL to have the failure reported to the caller
 *
 * return   0 for realloc failure, !=0 for success
 *
 * When the buffer is already large enough nothing is touched.  Otherwise
 * the buffer is realloc'ed, *pbuf and *psiz are updated, and *pbptr (if
 * given) is re-based so it keeps the same offset in the new buffer.
 */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
{
	char *tbuf;
	int rminlen, boff;

	if (minlen <= *psiz)
		return 1;

	rminlen = quantum ? minlen % quantum : 0;
	boff = pbptr ? *pbptr - *pbuf : 0;
	/* round up to next multiple of quantum */
	if (rminlen) {
		int pad = quantum - rminlen;

		/* skip the rounding rather than overflow int */
		if (pad <= 0 || minlen <= INT_MAX - pad)
			minlen += pad;
	}
	tbuf = (char *) realloc(*pbuf, minlen);
	/* whatrtn may legitimately be NULL, so guard it before %s */
	DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", NN(whatrtn), *psiz, minlen, (void*)*pbuf, (void*)tbuf);
	if (tbuf == NULL) {
		if (whatrtn)
			FATAL("out of memory in %s", whatrtn);
		return 0;
	}
	*pbuf = tbuf;
	*psiz = minlen;
	if (pbptr)
		*pbptr = tbuf + boff;
	return 1;
}
137 
/*
 * run - entry point for executing a parse tree.
 * Sets up the standard streams, executes the tree rooted at a (the
 * resulting cell is discarded) and finally closes every open file.
 */
void run(Node *a)
{
	stdinit();
	(void) execute(a);
	closeall();
}
145 
execute(Node * u)146 Cell *execute(Node *u)	/* execute a node of the parse tree */
147 {
148 	Cell *(*proc)(Node **, int);
149 	Cell *x;
150 	Node *a;
151 
152 	if (u == NULL)
153 		return(True);
154 	for (a = u; ; a = a->nnext) {
155 		curnode = a;
156 		if (isvalue(a)) {
157 			x = (Cell *) (a->narg[0]);
158 			if (isfld(x) && !donefld)
159 				fldbld();
160 			else if (isrec(x) && !donerec)
161 				recbld();
162 			return(x);
163 		}
164 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
165 			FATAL("illegal statement");
166 		proc = proctab[a->nobj-FIRSTTOKEN];
167 		x = (*proc)(a->narg, a->nobj);
168 		if (isfld(x) && !donefld)
169 			fldbld();
170 		else if (isrec(x) && !donerec)
171 			recbld();
172 		if (isexpr(a))
173 			return(x);
174 		if (isjump(x))
175 			return(x);
176 		if (a->nnext == NULL)
177 			return(x);
178 		tempfree(x);
179 	}
180 }
181 
182 
program(Node ** a,int n)183 Cell *program(Node **a, int n)	/* execute an awk program */
184 {				/* a[0] = BEGIN, a[1] = body, a[2] = END */
185 	Cell *x;
186 
187 	if (setjmp(env) != 0)
188 		goto ex;
189 	if (a[0]) {		/* BEGIN */
190 		x = execute(a[0]);
191 		if (isexit(x))
192 			return(True);
193 		if (isjump(x))
194 			FATAL("illegal break, continue, next or nextfile from BEGIN");
195 		tempfree(x);
196 	}
197 	if (a[1] || a[2])
198 		while (getrec(&record, &recsize, true) > 0) {
199 			x = execute(a[1]);
200 			if (isexit(x))
201 				break;
202 			tempfree(x);
203 		}
204   ex:
205 	if (setjmp(env) != 0)	/* handles exit within END */
206 		goto ex1;
207 	if (a[2]) {		/* END */
208 		x = execute(a[2]);
209 		if (isbreak(x) || isnext(x) || iscont(x))
210 			FATAL("illegal break, continue, next or nextfile from END");
211 		tempfree(x);
212 	}
213   ex1:
214 	return(True);
215 }
216 
/*
 * One Frame exists per active awk function call.  call() pushes a frame
 * (frp++) after evaluating the arguments and pops it (frp--) on normal
 * return; arg() and the RETURN case of jump() read the top frame.
 */
struct Frame {	/* stack frame for awk function calls */
	int nargs;	/* number of arguments in this call */
	Cell *fcncell;	/* pointer to Cell for function */
	Cell **args;	/* pointer to array of arguments after execute */
	Cell *retval;	/* return value */
};

/* call() is fatal when (args passed + params declared) exceeds this. */
#define	NARGS	50	/* max args in a call */

/* The frame array is created and grown by call(), 100 entries at a time. */
struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
int	nframe = 0;		/* number of frames allocated */
struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
229 
call(Node ** a,int n)230 Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
231 {
232 	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
233 	int i, ncall, ndef;
234 	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
235 	Node *x;
236 	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
237 	Cell *y, *z, *fcn;
238 	char *s;
239 
240 	fcn = execute(a[0]);	/* the function itself */
241 	s = fcn->nval;
242 	if (!isfcn(fcn))
243 		FATAL("calling undefined function %s", s);
244 	if (frame == NULL) {
245 		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
246 		if (frame == NULL)
247 			FATAL("out of space for stack frames calling %s", s);
248 	}
249 	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
250 		ncall++;
251 	ndef = (int) fcn->fval;			/* args in defn */
252 	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
253 	if (ncall > ndef)
254 		WARNING("function %s called with %d args, uses only %d",
255 			s, ncall, ndef);
256 	if (ncall + ndef > NARGS)
257 		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
258 	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
259 		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
260 		y = execute(x);
261 		oargs[i] = y;
262 		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
263 			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
264 		if (isfcn(y))
265 			FATAL("can't use function %s as argument in %s", y->nval, s);
266 		if (isarr(y))
267 			args[i] = y;	/* arrays by ref */
268 		else
269 			args[i] = copycell(y);
270 		tempfree(y);
271 	}
272 	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
273 		args[i] = gettemp();
274 		*args[i] = newcopycell;
275 	}
276 	frp++;	/* now ok to up frame */
277 	if (frp >= frame + nframe) {
278 		int dfp = frp - frame;	/* old index */
279 		frame = (struct Frame *) realloc(frame, (nframe += 100) * sizeof(*frame));
280 		if (frame == NULL)
281 			FATAL("out of space for stack frames in %s", s);
282 		frp = frame + dfp;
283 	}
284 	frp->fcncell = fcn;
285 	frp->args = args;
286 	frp->nargs = ndef;	/* number defined with (excess are locals) */
287 	frp->retval = gettemp();
288 
289 	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
290 	y = execute((Node *)(fcn->sval));	/* execute body */
291 	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));
292 
293 	for (i = 0; i < ndef; i++) {
294 		Cell *t = frp->args[i];
295 		if (isarr(t)) {
296 			if (t->csub == CCOPY) {
297 				if (i >= ncall) {
298 					freesymtab(t);
299 					t->csub = CTEMP;
300 					tempfree(t);
301 				} else {
302 					oargs[i]->tval = t->tval;
303 					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
304 					oargs[i]->sval = t->sval;
305 					tempfree(t);
306 				}
307 			}
308 		} else if (t != y) {	/* kludge to prevent freeing twice */
309 			t->csub = CTEMP;
310 			tempfree(t);
311 		} else if (t == y && t->csub == CCOPY) {
312 			t->csub = CTEMP;
313 			tempfree(t);
314 			freed = 1;
315 		}
316 	}
317 	tempfree(fcn);
318 	if (isexit(y) || isnext(y))
319 		return y;
320 	if (freed == 0) {
321 		tempfree(y);	/* don't free twice! */
322 	}
323 	z = frp->retval;			/* return value */
324 	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
325 	frp--;
326 	return(z);
327 }
328 
/*
 * copycell - duplicate cell x into a fresh temporary.
 * The copy drops the CON, FLD and REC flags and is tagged CCOPY so it
 * survives until the enclosing call is over.  String cells get their
 * own copy of the string; everything else is marked DONTFREE.
 */
Cell *copycell(Cell *x)
{
	Cell *dup = gettemp();

	/* copy is not constant or field */
	dup->tval = x->tval & ~(CON|FLD|REC);
	dup->csub = CCOPY;	/* prevents freeing until call is over */
	dup->nval = x->nval;	/* BUG? */
	if (!isstr(x) /* && x->ctype != OCELL */) {
		dup->tval |= DONTFREE;
	} else {
		dup->sval = tostring(x->sval);
		dup->tval &= ~DONTFREE;
	}
	dup->fval = x->fval;
	return dup;
}
347 
arg(Node ** a,int n)348 Cell *arg(Node **a, int n)	/* nth argument of a function */
349 {
350 
351 	n = ptoi(a[0]);	/* argument number, counting from 0 */
352 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
353 	if (n+1 > frp->nargs)
354 		FATAL("argument #%d of function %s was not supplied",
355 			n+1, frp->fcncell->nval);
356 	return frp->args[n];
357 }
358 
jump(Node ** a,int n)359 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
360 {
361 	Cell *y;
362 
363 	switch (n) {
364 	case EXIT:
365 		if (a[0] != NULL) {
366 			y = execute(a[0]);
367 			errorflag = (int) getfval(y);
368 			tempfree(y);
369 		}
370 		longjmp(env, 1);
371 	case RETURN:
372 		if (a[0] != NULL) {
373 			y = execute(a[0]);
374 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
375 				setsval(frp->retval, getsval(y));
376 				frp->retval->fval = getfval(y);
377 				frp->retval->tval |= NUM;
378 			}
379 			else if (y->tval & STR)
380 				setsval(frp->retval, getsval(y));
381 			else if (y->tval & NUM)
382 				setfval(frp->retval, getfval(y));
383 			else		/* can't happen */
384 				FATAL("bad type variable %d", y->tval);
385 			tempfree(y);
386 		}
387 		return(jret);
388 	case NEXT:
389 		return(jnext);
390 	case NEXTFILE:
391 		nextfile();
392 		return(jnextfile);
393 	case BREAK:
394 		return(jbreak);
395 	case CONTINUE:
396 		return(jcont);
397 	default:	/* can't happen */
398 		FATAL("illegal jump type %d", n);
399 	}
400 	return 0;	/* not reached */
401 }
402 
awkgetline(Node ** a,int n)403 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
404 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
405 	Cell *r, *x;
406 	extern Cell **fldtab;
407 	FILE *fp;
408 	char *buf;
409 	int bufsize = recsize;
410 	int mode;
411 	bool newflag;
412 	double result;
413 
414 	if ((buf = (char *) malloc(bufsize)) == NULL)
415 		FATAL("out of memory in getline");
416 
417 	fflush(stdout);	/* in case someone is waiting for a prompt */
418 	r = gettemp();
419 	if (a[1] != NULL) {		/* getline < file */
420 		x = execute(a[2]);		/* filename */
421 		mode = ptoi(a[1]);
422 		if (mode == '|')		/* input pipe */
423 			mode = LE;	/* arbitrary flag */
424 		fp = openfile(mode, getsval(x), &newflag);
425 		tempfree(x);
426 		if (fp == NULL)
427 			n = -1;
428 		else
429 			n = readrec(&buf, &bufsize, fp, newflag);
430 		if (n <= 0) {
431 			;
432 		} else if (a[0] != NULL) {	/* getline var <file */
433 			x = execute(a[0]);
434 			setsval(x, buf);
435 			if (is_number(x->sval, & result)) {
436 				x->fval = result;
437 				x->tval |= NUM;
438 			}
439 			tempfree(x);
440 		} else {			/* getline <file */
441 			setsval(fldtab[0], buf);
442 			if (is_number(fldtab[0]->sval, & result)) {
443 				fldtab[0]->fval = result;
444 				fldtab[0]->tval |= NUM;
445 			}
446 		}
447 	} else {			/* bare getline; use current input */
448 		if (a[0] == NULL)	/* getline */
449 			n = getrec(&record, &recsize, true);
450 		else {			/* getline var */
451 			n = getrec(&buf, &bufsize, false);
452 			if (n > 0) {
453 				x = execute(a[0]);
454 				setsval(x, buf);
455 				if (is_number(x->sval, & result)) {
456 					x->fval = result;
457 					x->tval |= NUM;
458 				}
459 				tempfree(x);
460 			}
461 		}
462 	}
463 	setfval(r, (Awkfloat) n);
464 	free(buf);
465 	return r;
466 }
467 
getnf(Node ** a,int n)468 Cell *getnf(Node **a, int n)	/* get NF */
469 {
470 	if (!donefld)
471 		fldbld();
472 	return (Cell *) a[0];
473 }
474 
475 static char *
makearraystring(Node * p,const char * func)476 makearraystring(Node *p, const char *func)
477 {
478 	char *buf;
479 	int bufsz = recsize;
480 	size_t blen;
481 
482 	if ((buf = (char *) malloc(bufsz)) == NULL) {
483 		FATAL("%s: out of memory", func);
484 	}
485 
486 	blen = 0;
487 	buf[blen] = '\0';
488 
489 	for (; p; p = p->nnext) {
490 		Cell *x = execute(p);	/* expr */
491 		char *s = getsval(x);
492 		size_t seplen = strlen(getsval(subseploc));
493 		size_t nsub = p->nnext ? seplen : 0;
494 		size_t slen = strlen(s);
495 		size_t tlen = blen + slen + nsub;
496 
497 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
498 			FATAL("%s: out of memory %s[%s...]",
499 			    func, x->nval, buf);
500 		}
501 		memcpy(buf + blen, s, slen);
502 		if (nsub) {
503 			memcpy(buf + blen + slen, *SUBSEP, nsub);
504 		}
505 		buf[tlen] = '\0';
506 		blen = tlen;
507 		tempfree(x);
508 	}
509 	return buf;
510 }
511 
array(Node ** a,int n)512 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
513 {
514 	Cell *x, *z;
515 	char *buf;
516 
517 	x = execute(a[0]);	/* Cell* for symbol table */
518 	buf = makearraystring(a[1], __func__);
519 	if (!isarr(x)) {
520 		DPRINTF("making %s into an array\n", NN(x->nval));
521 		if (freeable(x))
522 			xfree(x->sval);
523 		x->tval &= ~(STR|NUM|DONTFREE);
524 		x->tval |= ARR;
525 		x->sval = (char *) makesymtab(NSYMTAB);
526 	}
527 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
528 	z->ctype = OCELL;
529 	z->csub = CVAR;
530 	tempfree(x);
531 	free(buf);
532 	return(z);
533 }
534 
awkdelete(Node ** a,int n)535 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
536 {
537 	Cell *x;
538 
539 	x = execute(a[0]);	/* Cell* for symbol table */
540 	if (x == symtabloc) {
541 		FATAL("cannot delete SYMTAB or its elements");
542 	}
543 	if (!isarr(x))
544 		return True;
545 	if (a[1] == NULL) {	/* delete the elements, not the table */
546 		freesymtab(x);
547 		x->tval &= ~STR;
548 		x->tval |= ARR;
549 		x->sval = (char *) makesymtab(NSYMTAB);
550 	} else {
551 		char *buf = makearraystring(a[1], __func__);
552 		freeelem(x, buf);
553 		free(buf);
554 	}
555 	tempfree(x);
556 	return True;
557 }
558 
intest(Node ** a,int n)559 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
560 {
561 	Cell *ap, *k;
562 	char *buf;
563 
564 	ap = execute(a[1]);	/* array name */
565 	if (!isarr(ap)) {
566 		DPRINTF("making %s into an array\n", ap->nval);
567 		if (freeable(ap))
568 			xfree(ap->sval);
569 		ap->tval &= ~(STR|NUM|DONTFREE);
570 		ap->tval |= ARR;
571 		ap->sval = (char *) makesymtab(NSYMTAB);
572 	}
573 	buf = makearraystring(a[0], __func__);
574 	k = lookup(buf, (Array *) ap->sval);
575 	tempfree(ap);
576 	free(buf);
577 	if (k == NULL)
578 		return(False);
579 	else
580 		return(True);
581 }
582 
583 
584 /* ======== utf-8 code ========== */
585 
586 /*
587  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
588  * or utf-8.  u8_isutf tests whether a string starts with a valid
589  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
590  * u8_nextlen returns length of next valid sequence, which is
591  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
592  * u8_strlen returns length of string in valid utf-8 sequences
593  * and/or high-bit bytes.  Conversion functions go between byte
594  * number and character number.
595  *
596  * In theory, this behaves the same as before for non-utf8 bytes.
597  *
598  * Limited checking! This is a potential security hole.
599  */
600 
601 /* is s the beginning of a valid utf-8 string? */
602 /* return length 1..4 if yes, 0 if no */
u8_isutf(const char * s)603 int u8_isutf(const char *s)
604 {
605 	int n, ret;
606 	unsigned char c;
607 
608 	c = s[0];
609 	if (c < 128 || awk_mb_cur_max == 1)
610 		return 1; /* what if it's 0? */
611 
612 	n = strlen(s);
613 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
614 		ret = 2; /* 110xxxxx 10xxxxxx */
615 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
616 			 && (s[2] & 0xC0) == 0x80) {
617 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
618 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
619 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
620 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
621 	} else {
622 		ret = 0;
623 	}
624 	return ret;
625 }
626 
627 /* Convert (prefix of) utf8 string to utf-32 rune. */
628 /* Sets *rune to the value, returns the length. */
629 /* No error checking: watch out. */
u8_rune(int * rune,const char * s)630 int u8_rune(int *rune, const char *s)
631 {
632 	int n, ret;
633 	unsigned char c;
634 
635 	c = s[0];
636 	if (c < 128 || awk_mb_cur_max == 1) {
637 		*rune = c;
638 		return 1;
639 	}
640 
641 	n = strlen(s);
642 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
643 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
644 		ret = 2;
645 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
646 			  && (s[2] & 0xC0) == 0x80) {
647 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
648 			/* 1110xxxx 10xxxxxx 10xxxxxx */
649 		ret = 3;
650 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
651 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
652 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
653 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
654 		ret = 4;
655 	} else {
656 		*rune = c;
657 		ret = 1;
658 	}
659 	return ret; /* returns one byte if sequence doesn't look like utf */
660 }
661 
/* return length of next sequence: 1 for ascii or random, 2..4 for valid utf8 */
/* (a byte that u8_isutf rejects is stepped over as a single byte) */
int u8_nextlen(const char *s)
{
	int len = u8_isutf(s);

	return (len != 0) ? len : 1;
}
672 
673 /* return number of utf characters or single non-utf bytes */
u8_strlen(const char * s)674 int u8_strlen(const char *s)
675 {
676 	int i, len, n, totlen;
677 	unsigned char c;
678 
679 	n = strlen(s);
680 	totlen = 0;
681 	for (i = 0; i < n; i += len) {
682 		c = s[i];
683 		if (c < 128 || awk_mb_cur_max == 1) {
684 			len = 1;
685 		} else {
686 			len = u8_nextlen(&s[i]);
687 		}
688 		totlen++;
689 		if (i > n)
690 			FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
691 	}
692 	return totlen;
693 }
694 
/* convert utf-8 char number in a string to its byte offset */
/*
 * charnum counts characters from 0.  If charnum is larger than the
 * number of characters in s, the walk stops at the terminating NUL and
 * the byte length of s is returned, instead of reading past the end of
 * the string.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}
709 
/* convert byte offset in s to utf-8 char number that starts there */
/*
 * The result is 1-origin: offset 0 gives 1.  Returns -1 when bytenum
 * lies beyond the terminating NUL, and 0 for a negative bytenum.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int pos;
	int total = strlen(s);
	int count = 0; /* BUG: what origin? */
	/* should be 0 to match start==0 which means no match */

	if (bytenum > total) {
		return -1; /* ??? */
	}
	pos = 0;
	while (pos <= bytenum) {
		pos += u8_nextlen(s+pos);
		count++;
	}
	return count;
}
727 
728 /* runetochar() adapted from rune.c in the Plan 9 distribution */
729 
/* Bit layout constants for UTF-8 encoding. */
enum
{
	Runeerror = 128, /* from somewhere else */
	Runemax = 0x10FFFF,

	/* number of payload bits in the lead byte of an N-byte sequence */
	Bit1    = 7,
	Bitx    = 6,	/* payload bits in a continuation byte */
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	/* tag bits that mark each kind of byte */
	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */

	/* largest value that fits in an N-byte sequence */
	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0x7F */
	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0x7FF */
	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0xFFFF */
	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0x1FFFFF */

	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */

};

/*
 * runetochar - encode code point c as UTF-8 into str (no terminator is
 * written) and return the number of bytes stored, 1..4.
 *
 * Values up to Rune1, including negative ones, are stored as a single
 * byte.  Values above Runemax are replaced by Runeerror before being
 * encoded as a three-byte sequence.
 */
int runetochar(char *str, int c)
{
	int len, i, lead;

	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	if (c <= Rune2) {
		/* two character sequence 00080-007FF => T2 Tx */
		len = 2;
		lead = T2;
	} else {
		if (c > Runemax)
			c = Runeerror;
		if (c <= Rune3) {
			/* three character sequence 00800-0FFFF => T3 Tx Tx */
			len = 3;
			lead = T3;
		} else {
			/* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */
			len = 4;
			lead = T4;
		}
	}

	/* continuation bytes, last one first, six bits at a time */
	for (i = len - 1; i > 0; i--) {
		str[i] = Tx | (c & Maskx);
		c >>= Bitx;
	}
	/* whatever is left belongs in the lead byte */
	str[0] = lead | c;
	return len;
}
791 
792 
793 /* ========== end of utf8 code =========== */
794 
795 
796 
matchop(Node ** a,int n)797 Cell *matchop(Node **a, int n)	/* ~ and match() */
798 {
799 	Cell *x, *y, *z;
800 	char *s, *t;
801 	int i;
802 	int cstart, cpatlen, len;
803 	fa *pfa;
804 	int (*mf)(fa *, const char *) = match, mode = 0;
805 
806 	if (n == MATCHFCN) {
807 		mf = pmatch;
808 		mode = 1;
809 	}
810 	x = execute(a[1]);	/* a[1] = target text */
811 	s = getsval(x);
812 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
813 		i = (*mf)((fa *) a[2], s);
814 	else {
815 		y = execute(a[2]);	/* a[2] = regular expr */
816 		t = getsval(y);
817 		pfa = makedfa(t, mode);
818 		i = (*mf)(pfa, s);
819 		tempfree(y);
820 	}
821 	z = x;
822 	if (n == MATCHFCN) {
823 		int start = patbeg - s + 1; /* origin 1 */
824 		if (patlen < 0) {
825 			start = 0; /* not found */
826 		} else {
827 			cstart = u8_byte2char(s, start-1);
828 			cpatlen = 0;
829 			for (i = 0; i < patlen; i += len) {
830 				len = u8_nextlen(patbeg+i);
831 				cpatlen++;
832 			}
833 
834 			start = cstart;
835 			patlen = cpatlen;
836 		}
837 
838 		setfval(rstartloc, (Awkfloat) start);
839 		setfval(rlengthloc, (Awkfloat) patlen);
840 		x = gettemp();
841 		x->tval = NUM;
842 		x->fval = start;
843 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
844 		x = True;
845 	else
846 		x = False;
847 
848 	tempfree(z);
849 	return x;
850 }
851 
852 
boolop(Node ** a,int n)853 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
854 {
855 	Cell *x, *y;
856 	int i;
857 
858 	x = execute(a[0]);
859 	i = istrue(x);
860 	tempfree(x);
861 	switch (n) {
862 	case BOR:
863 		if (i) return(True);
864 		y = execute(a[1]);
865 		i = istrue(y);
866 		tempfree(y);
867 		if (i) return(True);
868 		else return(False);
869 	case AND:
870 		if ( !i ) return(False);
871 		y = execute(a[1]);
872 		i = istrue(y);
873 		tempfree(y);
874 		if (i) return(True);
875 		else return(False);
876 	case NOT:
877 		if (i) return(False);
878 		else return(True);
879 	default:	/* can't happen */
880 		FATAL("unknown boolean operator %d", n);
881 	}
882 	return 0;	/*NOTREACHED*/
883 }
884 
relop(Node ** a,int n)885 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
886 {
887 	int i;
888 	Cell *x, *y;
889 	Awkfloat j;
890 	bool x_is_nan, y_is_nan;
891 
892 	x = execute(a[0]);
893 	y = execute(a[1]);
894 	x_is_nan = isnan(x->fval);
895 	y_is_nan = isnan(y->fval);
896 	if (x->tval&NUM && y->tval&NUM) {
897 		if ((x_is_nan || y_is_nan) && n != NE)
898 			return(False);
899 		j = x->fval - y->fval;
900 		i = j<0? -1: (j>0? 1: 0);
901 	} else {
902 		i = strcmp(getsval(x), getsval(y));
903 	}
904 	tempfree(x);
905 	tempfree(y);
906 	switch (n) {
907 	case LT:	if (i<0) return(True);
908 			else return(False);
909 	case LE:	if (i<=0) return(True);
910 			else return(False);
911 	case NE:	if (x_is_nan && y_is_nan) return(True);
912 			else if (i!=0) return(True);
913 			else return(False);
914 	case EQ:	if (i == 0) return(True);
915 			else return(False);
916 	case GE:	if (i>=0) return(True);
917 			else return(False);
918 	case GT:	if (i>0) return(True);
919 			else return(False);
920 	default:	/* can't happen */
921 		FATAL("unknown relational operator %d", n);
922 	}
923 	return 0;	/*NOTREACHED*/
924 }
925 
/*
 * tfree - free a tempcell.
 * Releases the cell's string if the cell owns one, then pushes the cell
 * onto the front of the tmps free list.  Freeing the cell that is
 * already at the head of the list is reported as a fatal error.
 */
void tfree(Cell *a)
{
	if (freeable(a)) {
		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
		xfree(a->sval);
	}
	if (tmps == a) {
		FATAL("tempcell list is curdled");
	}
	a->cnext = tmps;
	tmps = a;
}
937 
gettemp(void)938 Cell *gettemp(void)	/* get a tempcell */
939 {	int i;
940 	Cell *x;
941 
942 	if (!tmps) {
943 		tmps = (Cell *) calloc(100, sizeof(*tmps));
944 		if (!tmps)
945 			FATAL("out of space for temporaries");
946 		for (i = 1; i < 100; i++)
947 			tmps[i-1].cnext = &tmps[i];
948 		tmps[i-1].cnext = NULL;
949 	}
950 	x = tmps;
951 	tmps = x->cnext;
952 	*x = tempcell;
953 	return(x);
954 }
955 
indirect(Node ** a,int n)956 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
957 {
958 	Awkfloat val;
959 	Cell *x;
960 	int m;
961 
962 	x = execute(a[0]);
963 	val = getfval(x);	/* freebsd: defend against super large field numbers */
964 	if ((Awkfloat)INT_MAX < val)
965 		FATAL("trying to access out of range field %s", x->nval);
966 	m = (int) val;
967 	tempfree(x);
968 	x = fieldadr(m);
969 	x->ctype = OCELL;	/* BUG?  why are these needed? */
970 	x->csub = CFLD;
971 	return(x);
972 }
973 
substr(Node ** a,int nnn)974 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
975 {
976 	int k, m, n;
977 	int mb, nb;
978 	char *s;
979 	int temp;
980 	Cell *x, *y, *z = NULL;
981 
982 	x = execute(a[0]);
983 	y = execute(a[1]);
984 	if (a[2] != NULL)
985 		z = execute(a[2]);
986 	s = getsval(x);
987 	k = u8_strlen(s) + 1;
988 	if (k <= 1) {
989 		tempfree(x);
990 		tempfree(y);
991 		if (a[2] != NULL) {
992 			tempfree(z);
993 		}
994 		x = gettemp();
995 		setsval(x, "");
996 		return(x);
997 	}
998 	m = (int) getfval(y);
999 	if (m <= 0)
1000 		m = 1;
1001 	else if (m > k)
1002 		m = k;
1003 	tempfree(y);
1004 	if (a[2] != NULL) {
1005 		n = (int) getfval(z);
1006 		tempfree(z);
1007 	} else
1008 		n = k - 1;
1009 	if (n < 0)
1010 		n = 0;
1011 	else if (n > k - m)
1012 		n = k - m;
1013 	/* m is start, n is length from there */
1014 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1015 	y = gettemp();
1016 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1017 	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
1018 
1019 	temp = s[nb];	/* with thanks to John Linderman */
1020 	s[nb] = '\0';
1021 	setsval(y, s + mb);
1022 	s[nb] = temp;
1023 	tempfree(x);
1024 	return(y);
1025 }
1026 
sindex(Node ** a,int nnn)1027 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1028 {
1029 	Cell *x, *y, *z;
1030 	char *s1, *s2, *p1, *p2, *q;
1031 	Awkfloat v = 0.0;
1032 
1033 	x = execute(a[0]);
1034 	s1 = getsval(x);
1035 	y = execute(a[1]);
1036 	s2 = getsval(y);
1037 
1038 	z = gettemp();
1039 	for (p1 = s1; *p1 != '\0'; p1++) {
1040 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1041 			continue;
1042 		if (*p2 == '\0') {
1043 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1044 
1045 		   /* should be a function: used in match() as well */
1046 			int i, len;
1047 			v = 0;
1048 			for (i = 0; i < p1-s1+1; i += len) {
1049 				len = u8_nextlen(s1+i);
1050 				v++;
1051 			}
1052 			break;
1053 		}
1054 	}
1055 	tempfree(x);
1056 	tempfree(y);
1057 	setfval(z, v);
1058 	return(z);
1059 }
1060 
/* return 1 if s contains any utf-8 (2 bytes or more) character, else 0 */
int has_utf8(char *s)
{
	while (*s != 0) {
		int step = u8_nextlen(s);

		if (step > 1)
			return 1;
		s += step;
	}
	return 0;
}
1072 
/*
 * Headroom, in bytes, that format() asks adjbuf() to keep available in
 * its output and format-spec buffers.
 * NOTE(review): presumably an upper bound on the length of one converted
 * number; confirm against the rest of format().
 */
#define	MAXNUMSIZE	50
1074 
format(char ** pbuf,int * pbufsize,const char * s,Node * a)1075 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1076 {
1077 	char *fmt;
1078 	char *p, *t;
1079 	const char *os;
1080 	Cell *x;
1081 	int flag = 0, n;
1082 	int fmtwd; /* format width */
1083 	int fmtsz = recsize;
1084 	char *buf = *pbuf;
1085 	int bufsize = *pbufsize;
1086 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1087 #define BUFSZ(a)   (bufsize - ((a) - buf))
1088 
1089 	static bool first = true;
1090 	static bool have_a_format = false;
1091 
1092 	if (first) {
1093 		char xbuf[100];
1094 
1095 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1096 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1097 		first = false;
1098 	}
1099 
1100 	os = s;
1101 	p = buf;
1102 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1103 		FATAL("out of memory in format()");
1104 	while (*s) {
1105 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1106 		if (*s != '%') {
1107 			*p++ = *s++;
1108 			continue;
1109 		}
1110 		if (*(s+1) == '%') {
1111 			*p++ = '%';
1112 			s += 2;
1113 			continue;
1114 		}
1115 		fmtwd = atoi(s+1);
1116 		if (fmtwd < 0)
1117 			fmtwd = -fmtwd;
1118 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1119 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1120 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1121 				FATAL("format item %.30s... ran format() out of memory", os);
1122 			/* Ignore size specifiers */
1123 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1124 				t--;
1125 				continue;
1126 			}
1127 			if (isalpha((uschar)*s))
1128 				break;
1129 			if (*s == '$') {
1130 				FATAL("'$' not permitted in awk formats");
1131 			}
1132 			if (*s == '*') {
1133 				if (a == NULL) {
1134 					FATAL("not enough args in printf(%s)", os);
1135 				}
1136 				x = execute(a);
1137 				a = a->nnext;
1138 				snprintf(t - 1, FMTSZ(t - 1),
1139 				    "%d", fmtwd=(int) getfval(x));
1140 				if (fmtwd < 0)
1141 					fmtwd = -fmtwd;
1142 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1143 				t = fmt + strlen(fmt);
1144 				tempfree(x);
1145 			}
1146 		}
1147 		*t = '\0';
1148 		if (fmtwd < 0)
1149 			fmtwd = -fmtwd;
1150 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1151 		switch (*s) {
1152 		case 'a': case 'A':
1153 			if (have_a_format)
1154 				flag = *s;
1155 			else
1156 				flag = 'f';
1157 			break;
1158 		case 'f': case 'e': case 'g': case 'E': case 'G':
1159 			flag = 'f';
1160 			break;
1161 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1162 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1163 			*(t-1) = 'j';
1164 			*t = *s;
1165 			*++t = '\0';
1166 			break;
1167 		case 's':
1168 			flag = 's';
1169 			break;
1170 		case 'c':
1171 			flag = 'c';
1172 			break;
1173 		default:
1174 			WARNING("weird printf conversion %s", fmt);
1175 			flag = '?';
1176 			break;
1177 		}
1178 		if (a == NULL)
1179 			FATAL("not enough args in printf(%s)", os);
1180 		x = execute(a);
1181 		a = a->nnext;
1182 		n = MAXNUMSIZE;
1183 		if (fmtwd > n)
1184 			n = fmtwd;
1185 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1186 		switch (flag) {
1187 		case '?':
1188 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1189 			t = getsval(x);
1190 			n = strlen(t);
1191 			if (fmtwd > n)
1192 				n = fmtwd;
1193 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1194 			p += strlen(p);
1195 			snprintf(p, BUFSZ(p), "%s", t);
1196 			break;
1197 		case 'a':
1198 		case 'A':
1199 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1200 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1201 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1202 
1203 		case 's': {
1204 			t = getsval(x);
1205 			n = strlen(t);
1206 			/* if simple format or no utf-8 in the string, sprintf works */
1207 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1208 				if (fmtwd > n)
1209 					n = fmtwd;
1210 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1211 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1212 						" ran format() out of memory", n, t);
1213 				snprintf(p, BUFSZ(p), fmt, t);
1214 				break;
1215 			}
1216 
1217 			/* get here if string has utf-8 chars and fmt is not plain %s */
1218 			/* "%-w.ps", where -, w and .p are all optional */
1219 			/* '0' before the w is a flag character */
1220 			/* fmt points at % */
1221 			int ljust = 0, wid = 0, prec = n, pad = 0;
1222 			char *f = fmt+1;
1223 			if (f[0] == '-') {
1224 				ljust = 1;
1225 				f++;
1226 			}
1227 			// flags '0' and '+' are recognized but skipped
1228 			if (f[0] == '0') {
1229 				f++;
1230 				if (f[0] == '+')
1231 					f++;
1232 			}
1233 			if (f[0] == '+') {
1234 				f++;
1235 				if (f[0] == '0')
1236 					f++;
1237 			}
1238 			if (isdigit(f[0])) { /* there is a wid */
1239 				wid = strtol(f, &f, 10);
1240 			}
1241 			if (f[0] == '.') { /* there is a .prec */
1242 				prec = strtol(++f, &f, 10);
1243 			}
1244 			if (prec > u8_strlen(t))
1245 				prec = u8_strlen(t);
1246 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1247 			int i, k, n;
1248 
1249 			if (ljust) { // print prec chars from t, then pad blanks
1250 				n = u8_char2byte(t, prec);
1251 				for (k = 0; k < n; k++) {
1252 					//putchar(t[k]);
1253 					*p++ = t[k];
1254 				}
1255 				for (i = 0; i < pad; i++) {
1256 					//printf(" ");
1257 					*p++ = ' ';
1258 				}
1259 			} else { // print pad blanks, then prec chars from t
1260 				for (i = 0; i < pad; i++) {
1261 					//printf(" ");
1262 					*p++ = ' ';
1263 				}
1264 				n = u8_char2byte(t, prec);
1265 				for (k = 0; k < n; k++) {
1266 					//putchar(t[k]);
1267 					*p++ = t[k];
1268 				}
1269 			}
1270 			*p = 0;
1271 			break;
1272 		}
1273 
1274                case 'c': {
1275 			/*
1276 			 * If a numeric value is given, awk should just turn
1277 			 * it into a character and print it:
1278 			 *      BEGIN { printf("%c\n", 65) }
1279 			 * prints "A".
1280 			 *
1281 			 * But what if the numeric value is > 128 and
1282 			 * represents a valid Unicode code point?!? We do
1283 			 * our best to convert it back into UTF-8. If we
1284 			 * can't, we output the encoding of the Unicode
1285 			 * "invalid character", 0xFFFD.
1286 			 */
1287 			if (isnum(x)) {
1288 				int charval = (int) getfval(x);
1289 
1290 				if (charval != 0) {
1291 					if (charval < 128 || awk_mb_cur_max == 1)
1292 						snprintf(p, BUFSZ(p), fmt, charval);
1293 					else {
1294 						// possible unicode character
1295 						size_t count;
1296 						char *bs = wide_char_to_byte_str(charval, &count);
1297 
1298 						if (bs == NULL)	{ // invalid character
1299 							// use unicode invalid character, 0xFFFD
1300 							static char invalid_char[] = "\357\277\275";
1301 							bs = invalid_char;
1302 							count = 3;
1303 						}
1304 						t = bs;
1305 						n = count;
1306 						goto format_percent_c;
1307 					}
1308 				} else {
1309 					*p++ = '\0'; /* explicit null byte */
1310 					*p = '\0';   /* next output will start here */
1311 				}
1312 				break;
1313 			}
1314 			t = getsval(x);
1315 			n = u8_nextlen(t);
1316 		format_percent_c:
1317 			if (n < 2) { /* not utf8 */
1318 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1319 				break;
1320 			}
1321 
1322 			// utf8 character, almost same song and dance as for %s
1323 			int ljust = 0, wid = 0, prec = n, pad = 0;
1324 			char *f = fmt+1;
1325 			if (f[0] == '-') {
1326 				ljust = 1;
1327 				f++;
1328 			}
1329 			// flags '0' and '+' are recognized but skipped
1330 			if (f[0] == '0') {
1331 				f++;
1332 				if (f[0] == '+')
1333 					f++;
1334 			}
1335 			if (f[0] == '+') {
1336 				f++;
1337 				if (f[0] == '0')
1338 					f++;
1339 			}
1340 			if (isdigit(f[0])) { /* there is a wid */
1341 				wid = strtol(f, &f, 10);
1342 			}
1343 			if (f[0] == '.') { /* there is a .prec */
1344 				prec = strtol(++f, &f, 10);
1345 			}
1346 			if (prec > 1)           // %c --> only one character
1347 				prec = 1;
1348 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1349 			int i;
1350 
1351 			if (ljust) { // print one char from t, then pad blanks
1352 				for (i = 0; i < n; i++)
1353 					*p++ = t[i];
1354 				for (i = 0; i < pad; i++) {
1355 					//printf(" ");
1356 					*p++ = ' ';
1357 				}
1358 			} else { // print pad blanks, then prec chars from t
1359 				for (i = 0; i < pad; i++) {
1360 					//printf(" ");
1361 					*p++ = ' ';
1362 				}
1363 				for (i = 0; i < n; i++)
1364 					*p++ = t[i];
1365 			}
1366 			*p = 0;
1367 			break;
1368 		}
1369 		default:
1370 			FATAL("can't happen: bad conversion %c in format()", flag);
1371 		}
1372 
1373 		tempfree(x);
1374 		p += strlen(p);
1375 		s++;
1376 	}
1377 	*p = '\0';
1378 	free(fmt);
1379 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1380 		x = execute(a);
1381 		tempfree(x);
1382 	}
1383 	*pbuf = buf;
1384 	*pbufsize = bufsize;
1385 	return p - buf;
1386 }
1387 
awksprintf(Node ** a,int n)1388 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1389 {
1390 	Cell *x;
1391 	Node *y;
1392 	char *buf;
1393 	int bufsz=3*recsize;
1394 
1395 	if ((buf = (char *) malloc(bufsz)) == NULL)
1396 		FATAL("out of memory in awksprintf");
1397 	y = a[0]->nnext;
1398 	x = execute(a[0]);
1399 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1400 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1401 	tempfree(x);
1402 	x = gettemp();
1403 	x->sval = buf;
1404 	x->tval = STR;
1405 	return(x);
1406 }
1407 
awkprintf(Node ** a,int n)1408 Cell *awkprintf(Node **a, int n)		/* printf */
1409 {	/* a[0] is list of args, starting with format string */
1410 	/* a[1] is redirection operator, a[2] is redirection file */
1411 	FILE *fp;
1412 	Cell *x;
1413 	Node *y;
1414 	char *buf;
1415 	int len;
1416 	int bufsz=3*recsize;
1417 
1418 	if ((buf = (char *) malloc(bufsz)) == NULL)
1419 		FATAL("out of memory in awkprintf");
1420 	y = a[0]->nnext;
1421 	x = execute(a[0]);
1422 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1423 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1424 	tempfree(x);
1425 	if (a[1] == NULL) {
1426 		/* fputs(buf, stdout); */
1427 		fwrite(buf, len, 1, stdout);
1428 		if (ferror(stdout))
1429 			FATAL("write error on stdout");
1430 	} else {
1431 		fp = redirect(ptoi(a[1]), a[2]);
1432 		/* fputs(buf, fp); */
1433 		fwrite(buf, len, 1, fp);
1434 		fflush(fp);
1435 		if (ferror(fp))
1436 			FATAL("write error on %s", filename(fp));
1437 	}
1438 	free(buf);
1439 	return(True);
1440 }
1441 
arith(Node ** a,int n)1442 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1443 {
1444 	Awkfloat i, j = 0;
1445 	double v;
1446 	Cell *x, *y, *z;
1447 
1448 	x = execute(a[0]);
1449 	i = getfval(x);
1450 	tempfree(x);
1451 	if (n != UMINUS && n != UPLUS) {
1452 		y = execute(a[1]);
1453 		j = getfval(y);
1454 		tempfree(y);
1455 	}
1456 	z = gettemp();
1457 	switch (n) {
1458 	case ADD:
1459 		i += j;
1460 		break;
1461 	case MINUS:
1462 		i -= j;
1463 		break;
1464 	case MULT:
1465 		i *= j;
1466 		break;
1467 	case DIVIDE:
1468 		if (j == 0)
1469 			FATAL("division by zero");
1470 		i /= j;
1471 		break;
1472 	case MOD:
1473 		if (j == 0)
1474 			FATAL("division by zero in mod");
1475 		modf(i/j, &v);
1476 		i = i - j * v;
1477 		break;
1478 	case UMINUS:
1479 		i = -i;
1480 		break;
1481 	case UPLUS: /* handled by getfval(), above */
1482 		break;
1483 	case POWER:
1484 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1485 			i = ipow(i, (int) j);
1486                else {
1487 			errno = 0;
1488 			i = errcheck(pow(i, j), "pow");
1489                }
1490 		break;
1491 	default:	/* can't happen */
1492 		FATAL("illegal arithmetic operator %d", n);
1493 	}
1494 	setfval(z, i);
1495 	return(z);
1496 }
1497 
/*
 * ipow: x raised to the integer power n by repeated squaring.
 * Returns 1 for n <= 0.  The bits of n are consumed from the most
 * significant one downwards, which performs the multiplications in the
 * same order as the halving recursion ipow(x, n/2).
 */
double ipow(double x, int n)	/* x**n.  ought to be done by pow, but isn't always */
{
	double acc = 1;
	int bit;

	if (n <= 0)
		return 1;

	/* find the highest set bit of n */
	for (bit = 1; bit <= n / 2; bit *= 2)
		continue;

	for ( ; bit > 0; bit /= 2) {
		if (n & bit)
			acc = x * acc * acc;
		else
			acc = acc * acc;
	}
	return acc;
}
1510 
incrdecr(Node ** a,int n)1511 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1512 {
1513 	Cell *x, *z;
1514 	int k;
1515 	Awkfloat xf;
1516 
1517 	x = execute(a[0]);
1518 	xf = getfval(x);
1519 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1520 	if (n == PREINCR || n == PREDECR) {
1521 		setfval(x, xf + k);
1522 		return(x);
1523 	}
1524 	z = gettemp();
1525 	setfval(z, xf);
1526 	setfval(x, xf + k);
1527 	tempfree(x);
1528 	return(z);
1529 }
1530 
/*
 * assign: plain and compound assignment.  n is the operator token.
 * The right-hand side a[1] is evaluated before the left-hand side a[0].
 * Returns the left-hand cell x after it has been updated.
 */
Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
{		/* this is subtle; don't muck with it. */
	Cell *x, *y;
	Awkfloat xf, yf;
	double v;

	y = execute(a[1]);
	x = execute(a[0]);
	if (n == ASSIGN) {	/* ordinary assignment */
		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
			;	/* self-assignment: leave alone unless it's a field or NF */
		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
			/* source carries both a string and a number: copy both */
			/* the number is fetched first and stored after setsval() */
			yf = getfval(y);
			setsval(x, getsval(y));
			x->fval = yf;
			x->tval |= NUM;
		}
		else if (isstr(y))
			setsval(x, getsval(y));
		else if (isnum(y))
			setfval(x, getfval(y));
		else
			funnyvar(y, "read value of");
		tempfree(y);
		return(x);
	}
	/* compound assignment: compute on numeric values, then store into x */
	xf = getfval(x);
	yf = getfval(y);
	switch (n) {
	case ADDEQ:
		xf += yf;
		break;
	case SUBEQ:
		xf -= yf;
		break;
	case MULTEQ:
		xf *= yf;
		break;
	case DIVEQ:
		/* only /= rejects a left side flagged CON */
		if ((x->tval & CON) != 0)
			FATAL("non-constant required for left side of /=");
		if (yf == 0)
			FATAL("division by zero in /=");
		xf /= yf;
		break;
	case MODEQ:
		if (yf == 0)
			FATAL("division by zero in %%=");
		/* remainder via the truncated quotient, as in arith() */
		modf(xf/yf, &v);
		xf = xf - yf * v;
		break;
	case POWEQ:
		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
			xf = ipow(xf, (int) yf);
               else {
			errno = 0;
			xf = errcheck(pow(xf, yf), "pow");
               }
		break;
	default:
		FATAL("illegal assignment operator %d", n);
		break;
	}
	tempfree(y);
	setfval(x, xf);
	return(x);
}
1598 
cat(Node ** a,int q)1599 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1600 {
1601 	Cell *x, *y, *z;
1602 	int n1, n2;
1603 	char *s = NULL;
1604 	int ssz = 0;
1605 
1606 	x = execute(a[0]);
1607 	n1 = strlen(getsval(x));
1608 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1609 	memcpy(s, x->sval, n1);
1610 
1611 	tempfree(x);
1612 
1613 	y = execute(a[1]);
1614 	n2 = strlen(getsval(y));
1615 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1616 	memcpy(s + n1, y->sval, n2);
1617 	s[n1 + n2] = '\0';
1618 
1619 	tempfree(y);
1620 
1621 	z = gettemp();
1622 	z->sval = s;
1623 	z->tval = STR;
1624 
1625 	return(z);
1626 }
1627 
pastat(Node ** a,int n)1628 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1629 {
1630 	Cell *x;
1631 
1632 	if (a[0] == NULL)
1633 		x = execute(a[1]);
1634 	else {
1635 		x = execute(a[0]);
1636 		if (istrue(x)) {
1637 			tempfree(x);
1638 			x = execute(a[1]);
1639 		}
1640 	}
1641 	return x;
1642 }
1643 
dopa2(Node ** a,int n)1644 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1645 {
1646 	Cell *x;
1647 	int pair;
1648 
1649 	pair = ptoi(a[3]);
1650 	if (pairstack[pair] == 0) {
1651 		x = execute(a[0]);
1652 		if (istrue(x))
1653 			pairstack[pair] = 1;
1654 		tempfree(x);
1655 	}
1656 	if (pairstack[pair] == 1) {
1657 		x = execute(a[1]);
1658 		if (istrue(x))
1659 			pairstack[pair] = 0;
1660 		tempfree(x);
1661 		x = execute(a[2]);
1662 		return(x);
1663 	}
1664 	return(False);
1665 }
1666 
split(Node ** a,int nnn)1667 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1668 {
1669 	Cell *x = NULL, *y, *ap;
1670 	const char *s, *origs, *t;
1671 	const char *fs = NULL;
1672 	char *origfs = NULL;
1673 	int sep;
1674 	char temp, num[50];
1675 	int n, tempstat, arg3type;
1676 	int j;
1677 	double result;
1678 
1679 	y = execute(a[0]);	/* source string */
1680 	origs = s = strdup(getsval(y));
1681 	tempfree(y);
1682 	arg3type = ptoi(a[3]);
1683 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1684 		fs = getsval(fsloc);
1685 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1686 		x = execute(a[2]);
1687 		fs = origfs = strdup(getsval(x));
1688 		tempfree(x);
1689 	} else if (arg3type == REGEXPR) {
1690 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1691 	} else {
1692 		FATAL("illegal type of split");
1693 	}
1694 	sep = *fs;
1695 	ap = execute(a[1]);	/* array name */
1696 /* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1697 	freesymtab(ap);
1698 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1699 	ap->tval &= ~STR;
1700 	ap->tval |= ARR;
1701 	ap->sval = (char *) makesymtab(NSYMTAB);
1702 
1703 	n = 0;
1704         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1705 		/* split(s, a, //); have to arrange that it looks like empty sep */
1706 		arg3type = 0;
1707 		fs = "";
1708 		sep = 0;
1709 	}
1710 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1711 		fa *pfa;
1712 		if (arg3type == REGEXPR) {	/* it's ready already */
1713 			pfa = (fa *) a[2];
1714 		} else {
1715 			pfa = makedfa(fs, 1);
1716 		}
1717 		if (nematch(pfa,s)) {
1718 			tempstat = pfa->initstat;
1719 			pfa->initstat = 2;
1720 			do {
1721 				n++;
1722 				snprintf(num, sizeof(num), "%d", n);
1723 				temp = *patbeg;
1724 				setptr(patbeg, '\0');
1725 				if (is_number(s, & result))
1726 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1727 				else
1728 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1729 				setptr(patbeg, temp);
1730 				s = patbeg + patlen;
1731 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1732 					n++;
1733 					snprintf(num, sizeof(num), "%d", n);
1734 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1735 					pfa->initstat = tempstat;
1736 					goto spdone;
1737 				}
1738 			} while (nematch(pfa,s));
1739 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1740 							/* cf gsub and refldbld */
1741 		}
1742 		n++;
1743 		snprintf(num, sizeof(num), "%d", n);
1744 		if (is_number(s, & result))
1745 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1746 		else
1747 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1748   spdone:
1749 		pfa = NULL;
1750 
1751 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1752 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1753 		for (;;) {
1754 			char *fr = newt;
1755 			n++;
1756 			if (*s == '"' ) { /* start of "..." */
1757 				for (s++ ; *s != '\0'; ) {
1758 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1759 						s += 2; /* doubled quote */
1760 						*fr++ = '"';
1761 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1762 						s++; /* skip over closing quote */
1763 						break;
1764 					} else {
1765 						*fr++ = *s++;
1766 					}
1767 				}
1768 				*fr++ = 0;
1769 			} else {	/* unquoted field */
1770 				while (*s != ',' && *s != '\0')
1771 					*fr++ = *s++;
1772 				*fr++ = 0;
1773 			}
1774 			snprintf(num, sizeof(num), "%d", n);
1775 			if (is_number(newt, &result))
1776 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1777 			else
1778 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1779 			if (*s++ == '\0')
1780 				break;
1781 		}
1782 		free(newt);
1783 
1784 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1785 		for (n = 0; ; ) {
1786 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1787 			while (ISWS(*s))
1788 				s++;
1789 			if (*s == '\0')
1790 				break;
1791 			n++;
1792 			t = s;
1793 			do
1794 				s++;
1795 			while (*s != '\0' && !ISWS(*s));
1796 			temp = *s;
1797 			setptr(s, '\0');
1798 			snprintf(num, sizeof(num), "%d", n);
1799 			if (is_number(t, & result))
1800 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1801 			else
1802 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1803 			setptr(s, temp);
1804 			if (*s != '\0')
1805 				s++;
1806 		}
1807 
1808 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1809 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1810 			char buf[10];
1811 			n++;
1812 			snprintf(num, sizeof(num), "%d", n);
1813 
1814 			for (j = 0; j < u8_nextlen(s); j++) {
1815 				buf[j] = s[j];
1816 			}
1817 			buf[j] = '\0';
1818 
1819 			if (isdigit((uschar)buf[0]))
1820 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1821 			else
1822 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1823 		}
1824 
1825 	} else if (*s != '\0') {  /* some random single character */
1826 		for (;;) {
1827 			n++;
1828 			t = s;
1829 			while (*s != sep && *s != '\0')
1830 				s++;
1831 			temp = *s;
1832 			setptr(s, '\0');
1833 			snprintf(num, sizeof(num), "%d", n);
1834 			if (is_number(t, & result))
1835 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1836 			else
1837 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1838 			setptr(s, temp);
1839 			if (*s++ == '\0')
1840 				break;
1841 		}
1842 	}
1843 	tempfree(ap);
1844 	xfree(origs);
1845 	xfree(origfs);
1846 	x = gettemp();
1847 	x->tval = NUM;
1848 	x->fval = n;
1849 	return(x);
1850 }
1851 
condexpr(Node ** a,int n)1852 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1853 {
1854 	Cell *x;
1855 
1856 	x = execute(a[0]);
1857 	if (istrue(x)) {
1858 		tempfree(x);
1859 		x = execute(a[1]);
1860 	} else {
1861 		tempfree(x);
1862 		x = execute(a[2]);
1863 	}
1864 	return(x);
1865 }
1866 
ifstat(Node ** a,int n)1867 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1868 {
1869 	Cell *x;
1870 
1871 	x = execute(a[0]);
1872 	if (istrue(x)) {
1873 		tempfree(x);
1874 		x = execute(a[1]);
1875 	} else if (a[2] != NULL) {
1876 		tempfree(x);
1877 		x = execute(a[2]);
1878 	}
1879 	return(x);
1880 }
1881 
whilestat(Node ** a,int n)1882 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1883 {
1884 	Cell *x;
1885 
1886 	for (;;) {
1887 		x = execute(a[0]);
1888 		if (!istrue(x))
1889 			return(x);
1890 		tempfree(x);
1891 		x = execute(a[1]);
1892 		if (isbreak(x)) {
1893 			x = True;
1894 			return(x);
1895 		}
1896 		if (isnext(x) || isexit(x) || isret(x))
1897 			return(x);
1898 		tempfree(x);
1899 	}
1900 }
1901 
dostat(Node ** a,int n)1902 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1903 {
1904 	Cell *x;
1905 
1906 	for (;;) {
1907 		x = execute(a[0]);
1908 		if (isbreak(x))
1909 			return True;
1910 		if (isnext(x) || isexit(x) || isret(x))
1911 			return(x);
1912 		tempfree(x);
1913 		x = execute(a[1]);
1914 		if (!istrue(x))
1915 			return(x);
1916 		tempfree(x);
1917 	}
1918 }
1919 
forstat(Node ** a,int n)1920 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1921 {
1922 	Cell *x;
1923 
1924 	x = execute(a[0]);
1925 	tempfree(x);
1926 	for (;;) {
1927 		if (a[1]!=NULL) {
1928 			x = execute(a[1]);
1929 			if (!istrue(x)) return(x);
1930 			else tempfree(x);
1931 		}
1932 		x = execute(a[3]);
1933 		if (isbreak(x))		/* turn off break */
1934 			return True;
1935 		if (isnext(x) || isexit(x) || isret(x))
1936 			return(x);
1937 		tempfree(x);
1938 		x = execute(a[2]);
1939 		tempfree(x);
1940 	}
1941 }
1942 
instat(Node ** a,int n)1943 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1944 {
1945 	Cell *x, *vp, *arrayp, *cp, *ncp;
1946 	Array *tp;
1947 	int i;
1948 
1949 	vp = execute(a[0]);
1950 	arrayp = execute(a[1]);
1951 	if (!isarr(arrayp)) {
1952 		return True;
1953 	}
1954 	tp = (Array *) arrayp->sval;
1955 	tempfree(arrayp);
1956 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1957 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1958 			setsval(vp, cp->nval);
1959 			ncp = cp->cnext;
1960 			x = execute(a[2]);
1961 			if (isbreak(x)) {
1962 				tempfree(vp);
1963 				return True;
1964 			}
1965 			if (isnext(x) || isexit(x) || isret(x)) {
1966 				tempfree(vp);
1967 				return(x);
1968 			}
1969 			tempfree(x);
1970 		}
1971 	}
1972 	return True;
1973 }
1974 
nawk_convert(const char * s,int (* fun_c)(int),wint_t (* fun_wc)(wint_t))1975 static char *nawk_convert(const char *s, int (*fun_c)(int),
1976     wint_t (*fun_wc)(wint_t))
1977 {
1978 	char *buf      = NULL;
1979 	char *pbuf     = NULL;
1980 	const char *ps = NULL;
1981 	size_t n       = 0;
1982 	wchar_t wc;
1983 	const size_t sz = awk_mb_cur_max;
1984 	int unused;
1985 
1986 	if (sz == 1) {
1987 		buf = tostring(s);
1988 
1989 		for (pbuf = buf; *pbuf; pbuf++)
1990 			*pbuf = fun_c((uschar)*pbuf);
1991 
1992 		return buf;
1993 	} else {
1994 		/* upper/lower character may be shorter/longer */
1995 		buf = tostringN(s, strlen(s) * sz + 1);
1996 
1997 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
1998 		/*
1999 		 * Reset internal state here too.
2000 		 * Assign result to avoid a compiler warning. (Casting to void
2001 		 * doesn't work.)
2002 		 * Increment said variable to avoid a different warning.
2003 		 */
2004 		unused = wctomb(NULL, L'\0');
2005 		unused++;
2006 
2007 		ps   = s;
2008 		pbuf = buf;
2009 		while (n = mbtowc(&wc, ps, sz),
2010 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2011 		{
2012 			ps += n;
2013 
2014 			n = wctomb(pbuf, fun_wc(wc));
2015 			if (n == (size_t)-1)
2016 				FATAL("illegal wide character %s", s);
2017 
2018 			pbuf += n;
2019 		}
2020 
2021 		*pbuf = '\0';
2022 
2023 		if (n)
2024 			FATAL("illegal byte sequence %s", s);
2025 
2026 		return buf;
2027 	}
2028 }
2029 
2030 #ifdef __DJGPP__
/*
 * towupper: replacement used when __DJGPP__ is defined.  Values in
 * 0..255 are mapped through toupper(); anything else is returned as is.
 */
static wint_t towupper(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;

	return toupper(wc & 0xFF);
}
2038 
/*
 * towlower: replacement used when __DJGPP__ is defined.  Values in
 * 0..255 are mapped through tolower(); anything else is returned as is.
 */
static wint_t towlower(wint_t wc)
{
	if (!(wc >= 0 && wc < 256))
		return wc;

	return tolower(wc & 0xFF);
}
2046 #endif
2047 
/*
 * nawk_toupper: upper-case copy of s, produced by nawk_convert() with
 * toupper for single bytes and towupper for wide characters.
 * The caller in bltin() frees the returned buffer.
 */
static char *nawk_toupper(const char *s)
{
	return nawk_convert(s, toupper, towupper);
}
2052 
/*
 * nawk_tolower: lower-case copy of s, produced by nawk_convert() with
 * tolower for single bytes and towlower for wide characters.
 * The caller in bltin() frees the returned buffer.
 */
static char *nawk_tolower(const char *s)
{
	return nawk_convert(s, tolower, towlower);
}
2057 
2058 
2059 
bltin(Node ** a,int n)2060 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2061 {
2062 	Cell *x, *y;
2063 	Awkfloat u = 0;
2064 	int t, sz;
2065 	Awkfloat tmp;
2066 	char *buf, *fmt;
2067 	Node *nextarg;
2068 	FILE *fp;
2069 	int status = 0;
2070 	time_t tv;
2071 	struct tm *tm, tmbuf;
2072 	int estatus = 0;
2073 
2074 	t = ptoi(a[0]);
2075 	x = execute(a[1]);
2076 	nextarg = a[1]->nnext;
2077 	switch (t) {
2078 	case FLENGTH:
2079 		if (isarr(x))
2080 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2081 		else
2082 			u = u8_strlen(getsval(x));
2083 		break;
2084 	case FLOG:
2085 		errno = 0;
2086 		u = errcheck(log(getfval(x)), "log");
2087 		break;
2088 	case FINT:
2089 		modf(getfval(x), &u); break;
2090 	case FEXP:
2091 		errno = 0;
2092 		u = errcheck(exp(getfval(x)), "exp");
2093 		break;
2094 	case FSQRT:
2095 		errno = 0;
2096 		u = errcheck(sqrt(getfval(x)), "sqrt");
2097 		break;
2098 	case FSIN:
2099 		u = sin(getfval(x)); break;
2100 	case FCOS:
2101 		u = cos(getfval(x)); break;
2102 	case FATAN:
2103 		if (nextarg == NULL) {
2104 			WARNING("atan2 requires two arguments; returning 1.0");
2105 			u = 1.0;
2106 		} else {
2107 			y = execute(a[1]->nnext);
2108 			u = atan2(getfval(x), getfval(y));
2109 			tempfree(y);
2110 			nextarg = nextarg->nnext;
2111 		}
2112 		break;
2113 	case FCOMPL:
2114 		u = ~((int)getfval(x));
2115 		break;
2116 	case FAND:
2117 		if (nextarg == 0) {
2118 			WARNING("and requires two arguments; returning 0");
2119 			u = 0;
2120 			break;
2121 		}
2122 		y = execute(a[1]->nnext);
2123 		u = ((int)getfval(x)) & ((int)getfval(y));
2124 		tempfree(y);
2125 		nextarg = nextarg->nnext;
2126 		break;
2127 	case FFOR:
2128 		if (nextarg == 0) {
2129 			WARNING("or requires two arguments; returning 0");
2130 			u = 0;
2131 			break;
2132 		}
2133 		y = execute(a[1]->nnext);
2134 		u = ((int)getfval(x)) | ((int)getfval(y));
2135 		tempfree(y);
2136 		nextarg = nextarg->nnext;
2137 		break;
2138 	case FXOR:
2139 		if (nextarg == 0) {
2140 			WARNING("xor requires two arguments; returning 0");
2141 			u = 0;
2142 			break;
2143 		}
2144 		y = execute(a[1]->nnext);
2145 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2146 		tempfree(y);
2147 		nextarg = nextarg->nnext;
2148 		break;
2149 	case FLSHIFT:
2150 		if (nextarg == 0) {
2151 			WARNING("lshift requires two arguments; returning 0");
2152 			u = 0;
2153 			break;
2154 		}
2155 		y = execute(a[1]->nnext);
2156 		u = ((int)getfval(x)) << ((int)getfval(y));
2157 		tempfree(y);
2158 		nextarg = nextarg->nnext;
2159 		break;
2160 	case FRSHIFT:
2161 		if (nextarg == 0) {
2162 			WARNING("rshift requires two arguments; returning 0");
2163 			u = 0;
2164 			break;
2165 		}
2166 		y = execute(a[1]->nnext);
2167 		u = ((int)getfval(x)) >> ((int)getfval(y));
2168 		tempfree(y);
2169 		nextarg = nextarg->nnext;
2170 		break;
2171 	case FSYSTEM:
2172 		fflush(stdout);		/* in case something is buffered already */
2173 		estatus = status = system(getsval(x));
2174 		if (status != -1) {
2175 			if (WIFEXITED(status)) {
2176 				estatus = WEXITSTATUS(status);
2177 			} else if (WIFSIGNALED(status)) {
2178 				estatus = WTERMSIG(status) + 256;
2179 #ifdef WCOREDUMP
2180 				if (WCOREDUMP(status))
2181 					estatus += 256;
2182 #endif
2183 			} else	/* something else?!? */
2184 				estatus = 0;
2185 		}
2186 		/* else estatus was set to -1 */
2187 		u = estatus;
2188 		break;
2189 	case FRAND:
2190 		/* random() returns numbers in [0..2^31-1]
2191 		 * in order to get a number in [0, 1), divide it by 2^31
2192 		 */
2193 		u = (Awkfloat) random() / RAND_MAX;
2194 		break;
2195 	case FSRAND:
2196 		if (isrec(x))	/* no argument provided */
2197 			u = time((time_t *)0);
2198 		else
2199 			u = getfval(x);
2200 		tmp = u;
2201 		srandom((unsigned long) u);
2202 		u = srand_seed;
2203 		srand_seed = tmp;
2204 		break;
2205 	case FTOUPPER:
2206 	case FTOLOWER:
2207 		if (t == FTOUPPER)
2208 			buf = nawk_toupper(getsval(x));
2209 		else
2210 			buf = nawk_tolower(getsval(x));
2211 		tempfree(x);
2212 		x = gettemp();
2213 		setsval(x, buf);
2214 		free(buf);
2215 		return x;
2216 	case FFLUSH:
2217 		if (isrec(x) || strlen(getsval(x)) == 0) {
2218 			flush_all();	/* fflush() or fflush("") -> all */
2219 			u = 0;
2220 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2221 			u = EOF;
2222 		else
2223 			u = fflush(fp);
2224 		break;
2225 	case FMKTIME:
2226 		memset(&tmbuf, 0, sizeof(tmbuf));
2227 		tm = &tmbuf;
2228 		t = sscanf(getsval(x), "%d %d %d %d %d %d %d",
2229 		    &tm->tm_year, &tm->tm_mon, &tm->tm_mday, &tm->tm_hour,
2230 		    &tm->tm_min, &tm->tm_sec, &tm->tm_isdst);
2231 		switch (t) {
2232 		case 6:
2233 			tm->tm_isdst = -1;	/* let mktime figure it out */
2234 			/* FALLTHROUGH */
2235 		case 7:
2236 			tm->tm_year -= 1900;
2237 			tm->tm_mon--;
2238 			u = mktime(tm);
2239 			break;
2240 		default:
2241 			u = -1;
2242 			break;
2243 		}
2244 		break;
2245 	case FSYSTIME:
2246 		u = time((time_t *) 0);
2247 		break;
2248 	case FSTRFTIME:
2249 		/* strftime([format [,timestamp]]) */
2250 		if (nextarg) {
2251 			y = execute(nextarg);
2252 			nextarg = nextarg->nnext;
2253 			tv = (time_t) getfval(y);
2254 			tempfree(y);
2255 		} else
2256 			tv = time((time_t *) 0);
2257 		tm = localtime(&tv);
2258 		if (tm == NULL)
2259 			FATAL("bad time %ld", (long)tv);
2260 
2261 		if (isrec(x)) {
2262 			/* format argument not provided, use default */
2263 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2264 		} else
2265 			fmt = tostring(getsval(x));
2266 
2267 		sz = 32;
2268 		buf = NULL;
2269 		do {
2270 			if ((buf = realloc(buf, (sz *= 2))) == NULL)
2271 				FATAL("out of memory in strftime");
2272 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2273 
2274 		y = gettemp();
2275 		setsval(y, buf);
2276 		free(fmt);
2277 		free(buf);
2278 
2279 		return y;
2280 	default:	/* can't happen */
2281 		FATAL("illegal function type %d", t);
2282 		break;
2283 	}
2284 	tempfree(x);
2285 	x = gettemp();
2286 	setfval(x, u);
2287 	if (nextarg != NULL) {
2288 		WARNING("warning: function has too many arguments");
2289 		for ( ; nextarg; nextarg = nextarg->nnext) {
2290 			y = execute(nextarg);
2291 			tempfree(y);
2292 		}
2293 	}
2294 	return(x);
2295 }
2296 
printstat(Node ** a,int n)2297 Cell *printstat(Node **a, int n)	/* print a[0] */
2298 {
2299 	Node *x;
2300 	Cell *y;
2301 	FILE *fp;
2302 
2303 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2304 		fp = stdout;
2305 	else
2306 		fp = redirect(ptoi(a[1]), a[2]);
2307 	for (x = a[0]; x != NULL; x = x->nnext) {
2308 		y = execute(x);
2309 		fputs(getpssval(y), fp);
2310 		tempfree(y);
2311 		if (x->nnext == NULL)
2312 			fputs(getsval(orsloc), fp);
2313 		else
2314 			fputs(getsval(ofsloc), fp);
2315 	}
2316 	if (a[1] != NULL)
2317 		fflush(fp);
2318 	if (ferror(fp))
2319 		FATAL("write error on %s", filename(fp));
2320 	return(True);
2321 }
2322 
nullproc(Node ** a,int n)2323 Cell *nullproc(Node **a, int n)
2324 {
2325 	return 0;
2326 }
2327 
2328 
/*
 * redirect: set up an i/o redirection.
 *
 * a is the redirection operator and b the expression naming the target.
 * The expression is evaluated, its string value handed to openfile(),
 * and the resulting stream returned.  Failure to open is fatal.
 */
FILE *redirect(int a, Node *b)
{
	Cell *target = execute(b);
	char *path = getsval(target);
	FILE *stream = openfile(a, path, NULL);

	if (stream == NULL)
		FATAL("can't open file %s", path);
	/* path points into target, so release target only after the last use */
	tempfree(target);
	return stream;
}
2343 
/* One entry in the table of streams opened by redirections. */
struct files {
	FILE	*fp;		/* the stream; NULL when the slot is free */
	const char	*fname;	/* name it was opened under; NULL once closed */
	int	mode;		/* '|', 'a', 'w' => LE/LT, GT */
};

struct files *files;	/* the table, allocated by stdinit() */

size_t nfiles;		/* number of slots in the table */
2351 
stdinit(void)2352 static void stdinit(void)	/* in case stdin, etc., are not constants */
2353 {
2354 	nfiles = FOPEN_MAX;
2355 	files = (struct files *) calloc(nfiles, sizeof(*files));
2356 	if (files == NULL)
2357 		FATAL("can't allocate file memory for %zu files", nfiles);
2358         files[0].fp = stdin;
2359 	files[0].fname = tostring("/dev/stdin");
2360 	files[0].mode = LT;
2361         files[1].fp = stdout;
2362 	files[1].fname = tostring("/dev/stdout");
2363 	files[1].mode = GT;
2364         files[2].fp = stderr;
2365 	files[2].fname = tostring("/dev/stderr");
2366 	files[2].mode = GT;
2367 }
2368 
openfile(int a,const char * us,bool * pnewflag)2369 FILE *openfile(int a, const char *us, bool *pnewflag)
2370 {
2371 	const char *s = us;
2372 	size_t i;
2373 	int m;
2374 	FILE *fp = NULL;
2375 	struct stat sbuf;
2376 
2377 	if (*s == '\0')
2378 		FATAL("null file name in print or getline");
2379 
2380 	for (i = 0; i < nfiles; i++)
2381 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2382 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2383 		     a == FFLUSH)) {
2384 			if (pnewflag)
2385 				*pnewflag = false;
2386 			return files[i].fp;
2387 		}
2388 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2389 		return NULL;
2390 	for (i = 0; i < nfiles; i++)
2391 		if (files[i].fp == NULL)
2392 			break;
2393 	if (i >= nfiles) {
2394 		struct files *nf;
2395 		size_t nnf = nfiles + FOPEN_MAX;
2396 		nf = (struct files *) realloc(files, nnf * sizeof(*nf));
2397 		if (nf == NULL)
2398 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2399 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2400 		nfiles = nnf;
2401 		files = nf;
2402 	}
2403 
2404 	fflush(stdout);	/* force a semblance of order */
2405 
2406 	/* don't try to read or write a directory */
2407 	if (a == LT || a == GT || a == APPEND)
2408 		if (stat(s, &sbuf) == 0 && S_ISDIR(sbuf.st_mode))
2409 				return NULL;
2410 
2411 	m = a;
2412 	if (a == GT) {
2413 		fp = fopen(s, "w");
2414 	} else if (a == APPEND) {
2415 		fp = fopen(s, "a");
2416 		m = GT;	/* so can mix > and >> */
2417 	} else if (a == '|') {	/* output pipe */
2418 		fp = popen(s, "w");
2419 	} else if (a == LE) {	/* input pipe */
2420 		fp = popen(s, "r");
2421 	} else if (a == LT) {	/* getline <file */
2422 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2423 	} else	/* can't happen */
2424 		FATAL("illegal redirection %d", a);
2425 	if (fp != NULL) {
2426 		files[i].fname = tostring(s);
2427 		files[i].fp = fp;
2428 		files[i].mode = m;
2429 		if (pnewflag)
2430 			*pnewflag = true;
2431 		if (fp != stdin && fp != stdout && fp != stderr)
2432 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2433 	}
2434 	return fp;
2435 }
2436 
filename(FILE * fp)2437 const char *filename(FILE *fp)
2438 {
2439 	size_t i;
2440 
2441 	for (i = 0; i < nfiles; i++)
2442 		if (fp == files[i].fp)
2443 			return files[i].fname;
2444 	return "???";
2445 }
2446 
closefile(Node ** a,int n)2447 Cell *closefile(Node **a, int n)
2448 {
2449  	Cell *x;
2450 	size_t i;
2451 	bool stat;
2452 
2453  	x = execute(a[0]);
2454  	getsval(x);
2455 	stat = true;
2456  	for (i = 0; i < nfiles; i++) {
2457 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2458 			continue;
2459 		if (files[i].mode == GT || files[i].mode == '|')
2460 			fflush(files[i].fp);
2461 		if (ferror(files[i].fp)) {
2462 			if ((files[i].mode == GT && files[i].fp != stderr)
2463 			  || files[i].mode == '|')
2464 				FATAL("write error on %s", files[i].fname);
2465 			else
2466 				WARNING("i/o error occurred on %s", files[i].fname);
2467 		}
2468 		if (files[i].fp == stdin || files[i].fp == stdout ||
2469 		    files[i].fp == stderr)
2470 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2471 		else if (files[i].mode == '|' || files[i].mode == LE)
2472 			stat = pclose(files[i].fp) == -1;
2473 		else
2474 			stat = fclose(files[i].fp) == EOF;
2475 		if (stat)
2476 			WARNING("i/o error occurred closing %s", files[i].fname);
2477 		xfree(files[i].fname);
2478 		files[i].fname = NULL;	/* watch out for ref thru this */
2479 		files[i].fp = NULL;
2480 		break;
2481  	}
2482  	tempfree(x);
2483  	x = gettemp();
2484 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2485  	return(x);
2486 }
2487 
closeall(void)2488 void closeall(void)
2489 {
2490 	size_t i;
2491 	bool stat = false;
2492 
2493 	for (i = 0; i < nfiles; i++) {
2494 		if (! files[i].fp)
2495 			continue;
2496 		if (files[i].mode == GT || files[i].mode == '|')
2497 			fflush(files[i].fp);
2498 		if (ferror(files[i].fp)) {
2499 			if ((files[i].mode == GT && files[i].fp != stderr)
2500 			  || files[i].mode == '|')
2501 				FATAL("write error on %s", files[i].fname);
2502 			else
2503 				WARNING("i/o error occurred on %s", files[i].fname);
2504 		}
2505 		if (files[i].fp == stdin || files[i].fp == stdout ||
2506 		    files[i].fp == stderr)
2507 			continue;
2508 		if (files[i].mode == '|' || files[i].mode == LE)
2509 			stat = pclose(files[i].fp) == -1;
2510 		else
2511 			stat = fclose(files[i].fp) == EOF;
2512 		if (stat)
2513 			WARNING("i/o error occurred while closing %s", files[i].fname);
2514 	}
2515 }
2516 
flush_all(void)2517 static void flush_all(void)
2518 {
2519 	size_t i;
2520 
2521 	for (i = 0; i < nfiles; i++)
2522 		if (files[i].fp)
2523 			fflush(files[i].fp);
2524 }
2525 
2526 void backsub(char **pb_ptr, const char **sptr_ptr);
2527 
dosub(Node ** a,int subop)2528 Cell *dosub(Node **a, int subop)        /* sub and gsub */
2529 {
2530 	fa *pfa;
2531 	int tempstat = 0;
2532 	char *repl;
2533 	Cell *x;
2534 
2535 	char *buf = NULL;
2536 	char *pb = NULL;
2537 	int bufsz = recsize;
2538 
2539 	const char *r, *s;
2540 	const char *start;
2541 	const char *noempty = NULL;      /* empty match disallowed here */
2542 	size_t m = 0;                    /* match count */
2543 	size_t whichm = 0;               /* which match to select, 0 = global */
2544 	int mtype;                       /* match type */
2545 
2546 	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
2547 		pfa = (fa *) a[1];
2548 	} else {
2549 		x = execute(a[1]);
2550 		pfa = makedfa(getsval(x), 1);
2551 		tempfree(x);
2552 	}
2553 
2554 	x = execute(a[2]);	/* replacement string */
2555 	repl = tostring(getsval(x));
2556 	tempfree(x);
2557 
2558 	switch (subop) {
2559 	case SUB:
2560 		whichm = 1;
2561 		x = execute(a[3]);    /* source string */
2562 		break;
2563 	case GSUB:
2564 		whichm = 0;
2565 		x = execute(a[3]);    /* source string */
2566 		break;
2567 	default:
2568 		FATAL("dosub: unrecognized subop: %d", subop);
2569 	}
2570 
2571 	start = getsval(x);
2572 	while (pmatch(pfa, start)) {
2573 		if (buf == NULL) {
2574 			if ((pb = buf = (char *) malloc(bufsz)) == NULL)
2575 				FATAL("out of memory in dosub");
2576 			tempstat = pfa->initstat;
2577 			pfa->initstat = 2;
2578 		}
2579 
2580 		/* match types */
2581 		#define	MT_IGNORE  0  /* unselected or invalid */
2582 		#define MT_INSERT  1  /* selected, empty */
2583 		#define MT_REPLACE 2  /* selected, not empty */
2584 
2585 		/* an empty match just after replacement is invalid */
2586 
2587 		if (patbeg == noempty && patlen == 0) {
2588 			mtype = MT_IGNORE;    /* invalid, not counted */
2589 		} else if (whichm == ++m || whichm == 0) {
2590 			mtype = patlen ? MT_REPLACE : MT_INSERT;
2591 		} else {
2592 			mtype = MT_IGNORE;    /* unselected, but counted */
2593 		}
2594 
2595 		/* leading text: */
2596 		if (patbeg > start) {
2597 			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
2598 				recsize, &pb, "dosub");
2599 			s = start;
2600 			while (s < patbeg)
2601 				*pb++ = *s++;
2602 		}
2603 
2604 		if (mtype == MT_IGNORE)
2605 			goto matching_text;  /* skip replacement text */
2606 
2607 		r = repl;
2608 		while (*r != 0) {
2609 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
2610 			if (*r == '\\') {
2611 				backsub(&pb, &r);
2612 			} else if (*r == '&') {
2613 				r++;
2614 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
2615 					&pb, "dosub");
2616 				for (s = patbeg; s < patbeg+patlen; )
2617 					*pb++ = *s++;
2618 			} else {
2619 				*pb++ = *r++;
2620 			}
2621 		}
2622 
2623 matching_text:
2624 		if (mtype == MT_REPLACE || *patbeg == '\0')
2625 			goto next_search;  /* skip matching text */
2626 
2627 		if (patlen == 0)
2628 			patlen = u8_nextlen(patbeg);
2629 		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
2630 		s = patbeg;
2631 		while (s < patbeg + patlen)
2632 			*pb++ = *s++;
2633 
2634 next_search:
2635 		start = patbeg + patlen;
2636 		if (m == whichm || *patbeg == '\0')
2637 			break;
2638 		if (mtype == MT_REPLACE)
2639 			noempty = start;
2640 
2641 		#undef MT_IGNORE
2642 		#undef MT_INSERT
2643 		#undef MT_REPLACE
2644 	}
2645 
2646 	xfree(repl);
2647 
2648 	if (buf != NULL) {
2649 		pfa->initstat = tempstat;
2650 
2651 		/* trailing text */
2652 		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
2653 		while ((*pb++ = *start++) != '\0')
2654 			;
2655 
2656 		setsval(x, buf);
2657 		free(buf);
2658 	}
2659 
2660 	tempfree(x);
2661 	x = gettemp();
2662 	x->tval = NUM;
2663 	x->fval = m;
2664 	return x;
2665 }
2666 
gensub(Node ** a,int nnn)2667 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2668 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2669 {
2670 	Cell *x, *y, *res, *h;
2671 	char *rptr;
2672 	const char *sptr;
2673 	char *buf, *pb;
2674 	const char *t, *q;
2675 	fa *pfa;
2676 	int mflag, tempstat, num, whichm;
2677 	int bufsz = recsize;
2678 
2679 	if ((buf = malloc(bufsz)) == NULL)
2680 		FATAL("out of memory in gensub");
2681 	mflag = 0;	/* if mflag == 0, can replace empty string */
2682 	num = 0;
2683 	x = execute(a[4]);	/* source string */
2684 	t = getsval(x);
2685 	res = copycell(x);	/* target string - initially copy of source */
2686 	res->csub = CTEMP;	/* result values are temporary */
2687 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2688 		pfa = (fa *) a[1];	/* regular expression */
2689 	else {
2690 		y = execute(a[1]);
2691 		pfa = makedfa(getsval(y), 1);
2692 		tempfree(y);
2693 	}
2694 	y = execute(a[2]);	/* replacement string */
2695 	h = execute(a[3]);	/* which matches should be replaced */
2696 	sptr = getsval(h);
2697 	if (sptr[0] == 'g' || sptr[0] == 'G')
2698 		whichm = -1;
2699 	else {
2700 		/*
2701 		 * The specified number is index of replacement, starting
2702 		 * from 1. GNU awk treats index lower than 0 same as
2703 		 * 1, we do same for compatibility.
2704 		 */
2705 		whichm = (int) getfval(h) - 1;
2706 		if (whichm < 0)
2707 			whichm = 0;
2708 	}
2709 	tempfree(h);
2710 
2711 	if (pmatch(pfa, t)) {
2712 		char *sl;
2713 
2714 		tempstat = pfa->initstat;
2715 		pfa->initstat = 2;
2716 		pb = buf;
2717 		rptr = getsval(y);
2718 		/*
2719 		 * XXX if there are any backreferences in subst string,
2720 		 * complain now.
2721 		 */
2722 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2723 			if (strchr("0123456789", sl[1])) {
2724 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2725 			}
2726 		}
2727 
2728 		do {
2729 			if (whichm >= 0 && whichm != num) {
2730 				num++;
2731 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2732 
2733 				/* copy the part of string up to and including
2734 				 * match to output buffer */
2735 				while (t < patbeg + patlen)
2736 					*pb++ = *t++;
2737 				continue;
2738 			}
2739 
2740 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2741 				if (mflag == 0) {	/* can replace empty */
2742 					num++;
2743 					sptr = rptr;
2744 					while (*sptr != 0) {
2745 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2746 						if (*sptr == '\\') {
2747 							backsub(&pb, &sptr);
2748 						} else if (*sptr == '&') {
2749 							sptr++;
2750 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2751 							for (q = patbeg; q < patbeg+patlen; )
2752 								*pb++ = *q++;
2753 						} else
2754 							*pb++ = *sptr++;
2755 					}
2756 				}
2757 				if (*t == 0)	/* at end */
2758 					goto done;
2759 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2760 				*pb++ = *t++;
2761 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2762 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2763 				mflag = 0;
2764 			}
2765 			else {	/* matched nonempty string */
2766 				num++;
2767 				sptr = t;
2768 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2769 				while (sptr < patbeg)
2770 					*pb++ = *sptr++;
2771 				sptr = rptr;
2772 				while (*sptr != 0) {
2773 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2774 					if (*sptr == '\\') {
2775 						backsub(&pb, &sptr);
2776 					} else if (*sptr == '&') {
2777 						sptr++;
2778 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2779 						for (q = patbeg; q < patbeg+patlen; )
2780 							*pb++ = *q++;
2781 					} else
2782 						*pb++ = *sptr++;
2783 				}
2784 				t = patbeg + patlen;
2785 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2786 					goto done;
2787 				if (pb > buf + bufsz)
2788 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2789 				mflag = 1;
2790 			}
2791 		} while (pmatch(pfa,t));
2792 		sptr = t;
2793 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2794 		while ((*pb++ = *sptr++) != 0)
2795 			;
2796 	done:	if (pb > buf + bufsz)
2797 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2798 		*pb = '\0';
2799 		setsval(res, buf);
2800 		pfa->initstat = tempstat;
2801 	}
2802 	tempfree(x);
2803 	tempfree(y);
2804 	free(buf);
2805 	return(res);
2806 }
2807 
/*
 * backsub: handle the \\& variations in a sub/gsub/gensub replacement.
 *
 * On entry *sptr_ptr points at a backslash in the replacement text and
 * *pb_ptr at the next free byte of the output.  One escape sequence is
 * consumed and its translation emitted; both pointers are advanced.
 *
 *	\\\&	->	\&		(literal backslash, literal &)
 *	\\&	->	\  (the & is left for the caller to expand)
 *	\\x	->	\x  if POSIXLY_CORRECT is set, else \\x's two backslashes
 *	\&	->	&		(literal &)
 *	\x	->	\		(literal backslash)
 *
 * The environment is consulted only on the first call.
 */
void backsub(char **pb_ptr, const char **sptr_ptr)
{
	static bool env_checked = false;
	static bool posix_mode = false;
	char *out = *pb_ptr;
	const char *in = *sptr_ptr;

	if (!env_checked) {
		posix_mode = (getenv("POSIXLY_CORRECT") != NULL);
		env_checked = true;
	}

	if (in[1] == '&') {
		/* literal & */
		in++;
		*out++ = *in++;
	} else if (in[1] != '\\') {
		/* literal \ */
		*out++ = *in++;
	} else if (in[2] == '\\' && in[3] == '&') {
		/* \\\& -> \& */
		*out++ = '\\';
		*out++ = '&';
		in += 4;
	} else if (in[2] == '&') {
		/* \\& -> \ + matched */
		*out++ = '\\';
		in += 2;
	} else if (posix_mode) {
		/* \\x -> \x */
		in++;
		*out++ = *in++;
	} else {
		/* \\x -> \\x */
		*out++ = *in++;
		*out++ = *in++;
	}

	*pb_ptr = out;
	*sptr_ptr = in;
}
2844 
/*
 * wide_char_to_byte_str: encode code point `rune` as UTF-8.
 *
 * Returns a pointer to a NUL-terminated static buffer (overwritten by
 * the next call) and stores the number of encoded bytes, excluding the
 * terminator, in *outlen.  Returns NULL, leaving *outlen untouched,
 * when rune is negative or above 0x10FFFF.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char buf[5];
	int nbytes, lead, i;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	memset(buf, 0, sizeof(buf));

	/* pick the sequence length and the marker bits of the lead byte */
	if (rune <= 0x0000007F) {
		nbytes = 1;
		lead = 0x00;	/* 0xxxxxxx */
	} else if (rune <= 0x000007FF) {
		nbytes = 2;
		lead = 0xC0;	/* 110xxxxx 10xxxxxx */
	} else if (rune <= 0x0000FFFF) {
		nbytes = 3;
		lead = 0xE0;	/* 1110xxxx 10xxxxxx 10xxxxxx */
	} else {
		nbytes = 4;
		lead = 0xF0;	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	}

	/* lead byte carries the top bits, each continuation byte six more */
	buf[0] = lead | (rune >> (6 * (nbytes - 1)));
	for (i = 1; i < nbytes; i++)
		buf[i] = 0x80 | ((rune >> (6 * (nbytes - 1 - i))) & 0x3F);
	buf[nbytes] = '\0';

	*outlen = nbytes;
	return buf;
}
2882