xref: /freebsd/contrib/one-true-awk/run.c (revision 8d457988a72487b35ee3922671775d73169339e3)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #define DEBUG
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <errno.h>
29 #include <wctype.h>
30 #include <fcntl.h>
31 #include <setjmp.h>
32 #include <limits.h>
33 #include <math.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <time.h>
37 #include <sys/types.h>
38 #include <sys/wait.h>
39 #include "awk.h"
40 #include "awkgram.tab.h"
41 
42 
43 static void stdinit(void);
44 static void flush_all(void);
45 static char *wide_char_to_byte_str(int rune, size_t *outlen);
46 
47 #if 1
48 #define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
49 #else
tempfree(Cell * p)50 void tempfree(Cell *p) {
51 	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
52 		WARNING("bad csub %d in Cell %d %s",
53 			p->csub, p->ctype, p->sval);
54 	}
55 	if (istemp(p))
56 		tfree(p);
57 }
58 #endif
59 
60 /* do we really need these? */
61 /* #ifdef _NFILE */
62 /* #ifndef FOPEN_MAX */
63 /* #define FOPEN_MAX _NFILE */
64 /* #endif */
65 /* #endif */
66 /*  */
67 /* #ifndef	FOPEN_MAX */
68 /* #define	FOPEN_MAX	40 */	/* max number of open files */
69 /* #endif */
70 /*  */
71 /* #ifndef RAND_MAX */
72 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
73 /* #endif */
74 
75 jmp_buf env;
76 extern	int	pairstack[];
77 extern	Awkfloat	srand_seed;
78 
79 Node	*winner = NULL;	/* root of parse tree */
80 Cell	*tmps;		/* free temporary cells for execution */
81 
82 static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
83 Cell	*True	= &truecell;
84 static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
85 Cell	*False	= &falsecell;
86 static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
87 Cell	*jbreak	= &breakcell;
88 static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
89 Cell	*jcont	= &contcell;
90 static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
91 Cell	*jnext	= &nextcell;
92 static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
93 Cell	*jnextfile	= &nextfilecell;
94 static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
95 Cell	*jexit	= &exitcell;
96 static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
97 Cell	*jret	= &retcell;
98 static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
99 
100 Node	*curnode = NULL;	/* the node being executed, for debugging */
101 
102 /* buffer memory management */
adjbuf(char ** pbuf,int * psiz,int minlen,int quantum,char ** pbptr,const char * whatrtn)103 int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
104 	const char *whatrtn)
105 /* pbuf:    address of pointer to buffer being managed
106  * psiz:    address of buffer size variable
107  * minlen:  minimum length of buffer needed
108  * quantum: buffer size quantum
109  * pbptr:   address of movable pointer into buffer, or 0 if none
110  * whatrtn: name of the calling routine if failure should cause fatal error
111  *
112  * return   0 for realloc failure, !=0 for success
113  */
114 {
115 	if (minlen > *psiz) {
116 		char *tbuf;
117 		int rminlen = quantum ? minlen % quantum : 0;
118 		int boff = pbptr ? *pbptr - *pbuf : 0;
119 		/* round up to next multiple of quantum */
120 		if (rminlen)
121 			minlen += quantum - rminlen;
122 		tbuf = (char *) realloc(*pbuf, minlen);
123 		DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", whatrtn, *psiz, minlen, (void*)*pbuf, (void*)tbuf);
124 		if (tbuf == NULL) {
125 			if (whatrtn)
126 				FATAL("out of memory in %s", whatrtn);
127 			return 0;
128 		}
129 		*pbuf = tbuf;
130 		*psiz = minlen;
131 		if (pbptr)
132 			*pbptr = tbuf + boff;
133 	}
134 	return 1;
135 }
136 
run(Node * a)137 void run(Node *a)	/* execution of parse tree starts here */
138 {
139 
140 	stdinit();
141 	execute(a);
142 	closeall();
143 }
144 
execute(Node * u)145 Cell *execute(Node *u)	/* execute a node of the parse tree */
146 {
147 	Cell *(*proc)(Node **, int);
148 	Cell *x;
149 	Node *a;
150 
151 	if (u == NULL)
152 		return(True);
153 	for (a = u; ; a = a->nnext) {
154 		curnode = a;
155 		if (isvalue(a)) {
156 			x = (Cell *) (a->narg[0]);
157 			if (isfld(x) && !donefld)
158 				fldbld();
159 			else if (isrec(x) && !donerec)
160 				recbld();
161 			return(x);
162 		}
163 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
164 			FATAL("illegal statement");
165 		proc = proctab[a->nobj-FIRSTTOKEN];
166 		x = (*proc)(a->narg, a->nobj);
167 		if (isfld(x) && !donefld)
168 			fldbld();
169 		else if (isrec(x) && !donerec)
170 			recbld();
171 		if (isexpr(a))
172 			return(x);
173 		if (isjump(x))
174 			return(x);
175 		if (a->nnext == NULL)
176 			return(x);
177 		tempfree(x);
178 	}
179 }
180 
181 
program(Node ** a,int n)182 Cell *program(Node **a, int n)	/* execute an awk program */
183 {				/* a[0] = BEGIN, a[1] = body, a[2] = END */
184 	Cell *x;
185 
186 	if (setjmp(env) != 0)
187 		goto ex;
188 	if (a[0]) {		/* BEGIN */
189 		x = execute(a[0]);
190 		if (isexit(x))
191 			return(True);
192 		if (isjump(x))
193 			FATAL("illegal break, continue, next or nextfile from BEGIN");
194 		tempfree(x);
195 	}
196 	if (a[1] || a[2])
197 		while (getrec(&record, &recsize, true) > 0) {
198 			x = execute(a[1]);
199 			if (isexit(x))
200 				break;
201 			tempfree(x);
202 		}
203   ex:
204 	if (setjmp(env) != 0)	/* handles exit within END */
205 		goto ex1;
206 	if (a[2]) {		/* END */
207 		x = execute(a[2]);
208 		if (isbreak(x) || isnext(x) || iscont(x))
209 			FATAL("illegal break, continue, next or nextfile from END");
210 		tempfree(x);
211 	}
212   ex1:
213 	return(True);
214 }
215 
216 struct Frame {	/* stack frame for awk function calls */
217 	int nargs;	/* number of arguments in this call */
218 	Cell *fcncell;	/* pointer to Cell for function */
219 	Cell **args;	/* pointer to array of arguments after execute */
220 	Cell *retval;	/* return value */
221 };
222 
223 #define	NARGS	50	/* max args in a call */
224 
225 struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
226 int	nframe = 0;		/* number of frames allocated */
227 struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
228 
call(Node ** a,int n)229 Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
230 {
231 	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
232 	int i, ncall, ndef;
233 	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
234 	Node *x;
235 	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
236 	Cell *y, *z, *fcn;
237 	char *s;
238 
239 	fcn = execute(a[0]);	/* the function itself */
240 	s = fcn->nval;
241 	if (!isfcn(fcn))
242 		FATAL("calling undefined function %s", s);
243 	if (frame == NULL) {
244 		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
245 		if (frame == NULL)
246 			FATAL("out of space for stack frames calling %s", s);
247 	}
248 	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
249 		ncall++;
250 	ndef = (int) fcn->fval;			/* args in defn */
251 	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
252 	if (ncall > ndef)
253 		WARNING("function %s called with %d args, uses only %d",
254 			s, ncall, ndef);
255 	if (ncall + ndef > NARGS)
256 		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
257 	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
258 		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
259 		y = execute(x);
260 		oargs[i] = y;
261 		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
262 			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
263 		if (isfcn(y))
264 			FATAL("can't use function %s as argument in %s", y->nval, s);
265 		if (isarr(y))
266 			args[i] = y;	/* arrays by ref */
267 		else
268 			args[i] = copycell(y);
269 		tempfree(y);
270 	}
271 	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
272 		args[i] = gettemp();
273 		*args[i] = newcopycell;
274 	}
275 	frp++;	/* now ok to up frame */
276 	if (frp >= frame + nframe) {
277 		int dfp = frp - frame;	/* old index */
278 		frame = (struct Frame *) realloc(frame, (nframe += 100) * sizeof(*frame));
279 		if (frame == NULL)
280 			FATAL("out of space for stack frames in %s", s);
281 		frp = frame + dfp;
282 	}
283 	frp->fcncell = fcn;
284 	frp->args = args;
285 	frp->nargs = ndef;	/* number defined with (excess are locals) */
286 	frp->retval = gettemp();
287 
288 	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
289 	y = execute((Node *)(fcn->sval));	/* execute body */
290 	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));
291 
292 	for (i = 0; i < ndef; i++) {
293 		Cell *t = frp->args[i];
294 		if (isarr(t)) {
295 			if (t->csub == CCOPY) {
296 				if (i >= ncall) {
297 					freesymtab(t);
298 					t->csub = CTEMP;
299 					tempfree(t);
300 				} else {
301 					oargs[i]->tval = t->tval;
302 					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
303 					oargs[i]->sval = t->sval;
304 					tempfree(t);
305 				}
306 			}
307 		} else if (t != y) {	/* kludge to prevent freeing twice */
308 			t->csub = CTEMP;
309 			tempfree(t);
310 		} else if (t == y && t->csub == CCOPY) {
311 			t->csub = CTEMP;
312 			tempfree(t);
313 			freed = 1;
314 		}
315 	}
316 	tempfree(fcn);
317 	if (isexit(y) || isnext(y))
318 		return y;
319 	if (freed == 0) {
320 		tempfree(y);	/* don't free twice! */
321 	}
322 	z = frp->retval;			/* return value */
323 	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
324 	frp--;
325 	return(z);
326 }
327 
copycell(Cell * x)328 Cell *copycell(Cell *x)	/* make a copy of a cell in a temp */
329 {
330 	Cell *y;
331 
332 	/* copy is not constant or field */
333 
334 	y = gettemp();
335 	y->tval = x->tval & ~(CON|FLD|REC);
336 	y->csub = CCOPY;	/* prevents freeing until call is over */
337 	y->nval = x->nval;	/* BUG? */
338 	if (isstr(x) /* || x->ctype == OCELL */) {
339 		y->sval = tostring(x->sval);
340 		y->tval &= ~DONTFREE;
341 	} else
342 		y->tval |= DONTFREE;
343 	y->fval = x->fval;
344 	return y;
345 }
346 
arg(Node ** a,int n)347 Cell *arg(Node **a, int n)	/* nth argument of a function */
348 {
349 
350 	n = ptoi(a[0]);	/* argument number, counting from 0 */
351 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
352 	if (n+1 > frp->nargs)
353 		FATAL("argument #%d of function %s was not supplied",
354 			n+1, frp->fcncell->nval);
355 	return frp->args[n];
356 }
357 
jump(Node ** a,int n)358 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
359 {
360 	Cell *y;
361 
362 	switch (n) {
363 	case EXIT:
364 		if (a[0] != NULL) {
365 			y = execute(a[0]);
366 			errorflag = (int) getfval(y);
367 			tempfree(y);
368 		}
369 		longjmp(env, 1);
370 	case RETURN:
371 		if (a[0] != NULL) {
372 			y = execute(a[0]);
373 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
374 				setsval(frp->retval, getsval(y));
375 				frp->retval->fval = getfval(y);
376 				frp->retval->tval |= NUM;
377 			}
378 			else if (y->tval & STR)
379 				setsval(frp->retval, getsval(y));
380 			else if (y->tval & NUM)
381 				setfval(frp->retval, getfval(y));
382 			else		/* can't happen */
383 				FATAL("bad type variable %d", y->tval);
384 			tempfree(y);
385 		}
386 		return(jret);
387 	case NEXT:
388 		return(jnext);
389 	case NEXTFILE:
390 		nextfile();
391 		return(jnextfile);
392 	case BREAK:
393 		return(jbreak);
394 	case CONTINUE:
395 		return(jcont);
396 	default:	/* can't happen */
397 		FATAL("illegal jump type %d", n);
398 	}
399 	return 0;	/* not reached */
400 }
401 
awkgetline(Node ** a,int n)402 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
403 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
404 	Cell *r, *x;
405 	extern Cell **fldtab;
406 	FILE *fp;
407 	char *buf;
408 	int bufsize = recsize;
409 	int mode;
410 	bool newflag;
411 	double result;
412 
413 	if ((buf = (char *) malloc(bufsize)) == NULL)
414 		FATAL("out of memory in getline");
415 
416 	fflush(stdout);	/* in case someone is waiting for a prompt */
417 	r = gettemp();
418 	if (a[1] != NULL) {		/* getline < file */
419 		x = execute(a[2]);		/* filename */
420 		mode = ptoi(a[1]);
421 		if (mode == '|')		/* input pipe */
422 			mode = LE;	/* arbitrary flag */
423 		fp = openfile(mode, getsval(x), &newflag);
424 		tempfree(x);
425 		if (fp == NULL)
426 			n = -1;
427 		else
428 			n = readrec(&buf, &bufsize, fp, newflag);
429 		if (n <= 0) {
430 			;
431 		} else if (a[0] != NULL) {	/* getline var <file */
432 			x = execute(a[0]);
433 			setsval(x, buf);
434 			if (is_number(x->sval, & result)) {
435 				x->fval = result;
436 				x->tval |= NUM;
437 			}
438 			tempfree(x);
439 		} else {			/* getline <file */
440 			setsval(fldtab[0], buf);
441 			if (is_number(fldtab[0]->sval, & result)) {
442 				fldtab[0]->fval = result;
443 				fldtab[0]->tval |= NUM;
444 			}
445 		}
446 	} else {			/* bare getline; use current input */
447 		if (a[0] == NULL)	/* getline */
448 			n = getrec(&record, &recsize, true);
449 		else {			/* getline var */
450 			n = getrec(&buf, &bufsize, false);
451 			if (n > 0) {
452 				x = execute(a[0]);
453 				setsval(x, buf);
454 				if (is_number(x->sval, & result)) {
455 					x->fval = result;
456 					x->tval |= NUM;
457 				}
458 				tempfree(x);
459 			}
460 		}
461 	}
462 	setfval(r, (Awkfloat) n);
463 	free(buf);
464 	return r;
465 }
466 
getnf(Node ** a,int n)467 Cell *getnf(Node **a, int n)	/* get NF */
468 {
469 	if (!donefld)
470 		fldbld();
471 	return (Cell *) a[0];
472 }
473 
474 static char *
makearraystring(Node * p,const char * func)475 makearraystring(Node *p, const char *func)
476 {
477 	char *buf;
478 	int bufsz = recsize;
479 	size_t blen;
480 
481 	if ((buf = (char *) malloc(bufsz)) == NULL) {
482 		FATAL("%s: out of memory", func);
483 	}
484 
485 	blen = 0;
486 	buf[blen] = '\0';
487 
488 	for (; p; p = p->nnext) {
489 		Cell *x = execute(p);	/* expr */
490 		char *s = getsval(x);
491 		size_t seplen = strlen(getsval(subseploc));
492 		size_t nsub = p->nnext ? seplen : 0;
493 		size_t slen = strlen(s);
494 		size_t tlen = blen + slen + nsub;
495 
496 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
497 			FATAL("%s: out of memory %s[%s...]",
498 			    func, x->nval, buf);
499 		}
500 		memcpy(buf + blen, s, slen);
501 		if (nsub) {
502 			memcpy(buf + blen + slen, *SUBSEP, nsub);
503 		}
504 		buf[tlen] = '\0';
505 		blen = tlen;
506 		tempfree(x);
507 	}
508 	return buf;
509 }
510 
array(Node ** a,int n)511 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
512 {
513 	Cell *x, *z;
514 	char *buf;
515 
516 	x = execute(a[0]);	/* Cell* for symbol table */
517 	buf = makearraystring(a[1], __func__);
518 	if (!isarr(x)) {
519 		DPRINTF("making %s into an array\n", NN(x->nval));
520 		if (freeable(x))
521 			xfree(x->sval);
522 		x->tval &= ~(STR|NUM|DONTFREE);
523 		x->tval |= ARR;
524 		x->sval = (char *) makesymtab(NSYMTAB);
525 	}
526 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
527 	z->ctype = OCELL;
528 	z->csub = CVAR;
529 	tempfree(x);
530 	free(buf);
531 	return(z);
532 }
533 
awkdelete(Node ** a,int n)534 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
535 {
536 	Cell *x;
537 
538 	x = execute(a[0]);	/* Cell* for symbol table */
539 	if (x == symtabloc) {
540 		FATAL("cannot delete SYMTAB or its elements");
541 	}
542 	if (!isarr(x))
543 		return True;
544 	if (a[1] == NULL) {	/* delete the elements, not the table */
545 		freesymtab(x);
546 		x->tval &= ~STR;
547 		x->tval |= ARR;
548 		x->sval = (char *) makesymtab(NSYMTAB);
549 	} else {
550 		char *buf = makearraystring(a[1], __func__);
551 		freeelem(x, buf);
552 		free(buf);
553 	}
554 	tempfree(x);
555 	return True;
556 }
557 
intest(Node ** a,int n)558 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
559 {
560 	Cell *ap, *k;
561 	char *buf;
562 
563 	ap = execute(a[1]);	/* array name */
564 	if (!isarr(ap)) {
565 		DPRINTF("making %s into an array\n", ap->nval);
566 		if (freeable(ap))
567 			xfree(ap->sval);
568 		ap->tval &= ~(STR|NUM|DONTFREE);
569 		ap->tval |= ARR;
570 		ap->sval = (char *) makesymtab(NSYMTAB);
571 	}
572 	buf = makearraystring(a[0], __func__);
573 	k = lookup(buf, (Array *) ap->sval);
574 	tempfree(ap);
575 	free(buf);
576 	if (k == NULL)
577 		return(False);
578 	else
579 		return(True);
580 }
581 
582 
583 /* ======== utf-8 code ========== */
584 
585 /*
586  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
587  * or utf-8.  u8_isutf tests whether a string starts with a valid
588  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
589  * u8_nextlen returns length of next valid sequence, which is
590  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
591  * u8_strlen returns length of string in valid utf-8 sequences
592  * and/or high-bit bytes.  Conversion functions go between byte
593  * number and character number.
594  *
595  * In theory, this behaves the same as before for non-utf8 bytes.
596  *
597  * Limited checking! This is a potential security hole.
598  */
599 
600 /* is s the beginning of a valid utf-8 string? */
601 /* return length 1..4 if yes, 0 if no */
u8_isutf(const char * s)602 int u8_isutf(const char *s)
603 {
604 	int n, ret;
605 	unsigned char c;
606 
607 	c = s[0];
608 	if (c < 128 || awk_mb_cur_max == 1)
609 		return 1; /* what if it's 0? */
610 
611 	n = strlen(s);
612 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
613 		ret = 2; /* 110xxxxx 10xxxxxx */
614 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
615 			 && (s[2] & 0xC0) == 0x80) {
616 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
617 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
618 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
619 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
620 	} else {
621 		ret = 0;
622 	}
623 	return ret;
624 }
625 
626 /* Convert (prefix of) utf8 string to utf-32 rune. */
627 /* Sets *rune to the value, returns the length. */
628 /* No error checking: watch out. */
u8_rune(int * rune,const char * s)629 int u8_rune(int *rune, const char *s)
630 {
631 	int n, ret;
632 	unsigned char c;
633 
634 	c = s[0];
635 	if (c < 128 || awk_mb_cur_max == 1) {
636 		*rune = c;
637 		return 1;
638 	}
639 
640 	n = strlen(s);
641 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
642 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
643 		ret = 2;
644 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
645 			  && (s[2] & 0xC0) == 0x80) {
646 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
647 			/* 1110xxxx 10xxxxxx 10xxxxxx */
648 		ret = 3;
649 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
650 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
651 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
652 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
653 		ret = 4;
654 	} else {
655 		*rune = c;
656 		ret = 1;
657 	}
658 	return ret; /* returns one byte if sequence doesn't look like utf */
659 }
660 
661 /* return length of next sequence: 1 for ascii or random, 2..4 for valid utf8 */
u8_nextlen(const char * s)662 int u8_nextlen(const char *s)
663 {
664 	int len;
665 
666 	len = u8_isutf(s);
667 	if (len == 0)
668 		len = 1;
669 	return len;
670 }
671 
672 /* return number of utf characters or single non-utf bytes */
u8_strlen(const char * s)673 int u8_strlen(const char *s)
674 {
675 	int i, len, n, totlen;
676 	unsigned char c;
677 
678 	n = strlen(s);
679 	totlen = 0;
680 	for (i = 0; i < n; i += len) {
681 		c = s[i];
682 		if (c < 128 || awk_mb_cur_max == 1) {
683 			len = 1;
684 		} else {
685 			len = u8_nextlen(&s[i]);
686 		}
687 		totlen++;
688 		if (i > n)
689 			FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
690 	}
691 	return totlen;
692 }
693 
694 /* convert utf-8 char number in a string to its byte offset */
u8_char2byte(const char * s,int charnum)695 int u8_char2byte(const char *s, int charnum)
696 {
697 	int n;
698 	int bytenum = 0;
699 
700 	while (charnum > 0) {
701 		n = u8_nextlen(s);
702 		s += n;
703 		bytenum += n;
704 		charnum--;
705 	}
706 	return bytenum;
707 }
708 
709 /* convert byte offset in s to utf-8 char number that starts there */
u8_byte2char(const char * s,int bytenum)710 int u8_byte2char(const char *s, int bytenum)
711 {
712 	int i, len, b;
713 	int charnum = 0; /* BUG: what origin? */
714 	/* should be 0 to match start==0 which means no match */
715 
716 	b = strlen(s);
717 	if (bytenum > b) {
718 		return -1; /* ??? */
719 	}
720 	for (i = 0; i <= bytenum; i += len) {
721 		len = u8_nextlen(s+i);
722 		charnum++;
723 	}
724 	return charnum;
725 }
726 
727 /* runetochar() adapted from rune.c in the Plan 9 distribution */
728 
729 enum
730 {
731 	Runeerror = 128, /* from somewhere else */
732 	Runemax = 0x10FFFF,
733 
734 	Bit1    = 7,
735 	Bitx    = 6,
736 	Bit2    = 5,
737 	Bit3    = 4,
738 	Bit4    = 3,
739 	Bit5    = 2,
740 
741 	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
742 	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
743 	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
744 	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
745 	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
746 	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */
747 
748 	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
749 	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
750 	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
751 	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0011 1111 1111 1111 1111 1111 */
752 
753 	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
754 	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */
755 
756 };
757 
runetochar(char * str,int c)758 int runetochar(char *str, int c)
759 {
760 	/* one character sequence 00000-0007F => 00-7F */
761 	if (c <= Rune1) {
762 		str[0] = c;
763 		return 1;
764 	}
765 
766 	/* two character sequence 00080-007FF => T2 Tx */
767 	if (c <= Rune2) {
768 		str[0] = T2 | (c >> 1*Bitx);
769 		str[1] = Tx | (c & Maskx);
770 		return 2;
771 	}
772 
773 	/* three character sequence 00800-0FFFF => T3 Tx Tx */
774 	if (c > Runemax)
775 		c = Runeerror;
776 	if (c <= Rune3) {
777 		str[0] = T3 |  (c >> 2*Bitx);
778 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
779 		str[2] = Tx |  (c & Maskx);
780 		return 3;
781 	}
782 
783 	/* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */
784 	str[0] = T4 |  (c >> 3*Bitx);
785 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
786 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
787 	str[3] = Tx |  (c & Maskx);
788 	return 4;
789 }
790 
791 
792 /* ========== end of utf8 code =========== */
793 
794 
795 
matchop(Node ** a,int n)796 Cell *matchop(Node **a, int n)	/* ~ and match() */
797 {
798 	Cell *x, *y, *z;
799 	char *s, *t;
800 	int i;
801 	int cstart, cpatlen, len;
802 	fa *pfa;
803 	int (*mf)(fa *, const char *) = match, mode = 0;
804 
805 	if (n == MATCHFCN) {
806 		mf = pmatch;
807 		mode = 1;
808 	}
809 	x = execute(a[1]);	/* a[1] = target text */
810 	s = getsval(x);
811 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
812 		i = (*mf)((fa *) a[2], s);
813 	else {
814 		y = execute(a[2]);	/* a[2] = regular expr */
815 		t = getsval(y);
816 		pfa = makedfa(t, mode);
817 		i = (*mf)(pfa, s);
818 		tempfree(y);
819 	}
820 	z = x;
821 	if (n == MATCHFCN) {
822 		int start = patbeg - s + 1; /* origin 1 */
823 		if (patlen < 0) {
824 			start = 0; /* not found */
825 		} else {
826 			cstart = u8_byte2char(s, start-1);
827 			cpatlen = 0;
828 			for (i = 0; i < patlen; i += len) {
829 				len = u8_nextlen(patbeg+i);
830 				cpatlen++;
831 			}
832 
833 			start = cstart;
834 			patlen = cpatlen;
835 		}
836 
837 		setfval(rstartloc, (Awkfloat) start);
838 		setfval(rlengthloc, (Awkfloat) patlen);
839 		x = gettemp();
840 		x->tval = NUM;
841 		x->fval = start;
842 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
843 		x = True;
844 	else
845 		x = False;
846 
847 	tempfree(z);
848 	return x;
849 }
850 
851 
boolop(Node ** a,int n)852 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
853 {
854 	Cell *x, *y;
855 	int i;
856 
857 	x = execute(a[0]);
858 	i = istrue(x);
859 	tempfree(x);
860 	switch (n) {
861 	case BOR:
862 		if (i) return(True);
863 		y = execute(a[1]);
864 		i = istrue(y);
865 		tempfree(y);
866 		if (i) return(True);
867 		else return(False);
868 	case AND:
869 		if ( !i ) return(False);
870 		y = execute(a[1]);
871 		i = istrue(y);
872 		tempfree(y);
873 		if (i) return(True);
874 		else return(False);
875 	case NOT:
876 		if (i) return(False);
877 		else return(True);
878 	default:	/* can't happen */
879 		FATAL("unknown boolean operator %d", n);
880 	}
881 	return 0;	/*NOTREACHED*/
882 }
883 
relop(Node ** a,int n)884 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
885 {
886 	int i;
887 	Cell *x, *y;
888 	Awkfloat j;
889 	bool x_is_nan, y_is_nan;
890 
891 	x = execute(a[0]);
892 	y = execute(a[1]);
893 	x_is_nan = isnan(x->fval);
894 	y_is_nan = isnan(y->fval);
895 	if (x->tval&NUM && y->tval&NUM) {
896 		if ((x_is_nan || y_is_nan) && n != NE)
897 			return(False);
898 		j = x->fval - y->fval;
899 		i = j<0? -1: (j>0? 1: 0);
900 	} else {
901 		i = strcmp(getsval(x), getsval(y));
902 	}
903 	tempfree(x);
904 	tempfree(y);
905 	switch (n) {
906 	case LT:	if (i<0) return(True);
907 			else return(False);
908 	case LE:	if (i<=0) return(True);
909 			else return(False);
910 	case NE:	if (x_is_nan && y_is_nan) return(True);
911 			else if (i!=0) return(True);
912 			else return(False);
913 	case EQ:	if (i == 0) return(True);
914 			else return(False);
915 	case GE:	if (i>=0) return(True);
916 			else return(False);
917 	case GT:	if (i>0) return(True);
918 			else return(False);
919 	default:	/* can't happen */
920 		FATAL("unknown relational operator %d", n);
921 	}
922 	return 0;	/*NOTREACHED*/
923 }
924 
tfree(Cell * a)925 void tfree(Cell *a)	/* free a tempcell */
926 {
927 	if (freeable(a)) {
928 		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
929 		xfree(a->sval);
930 	}
931 	if (a == tmps)
932 		FATAL("tempcell list is curdled");
933 	a->cnext = tmps;
934 	tmps = a;
935 }
936 
gettemp(void)937 Cell *gettemp(void)	/* get a tempcell */
938 {	int i;
939 	Cell *x;
940 
941 	if (!tmps) {
942 		tmps = (Cell *) calloc(100, sizeof(*tmps));
943 		if (!tmps)
944 			FATAL("out of space for temporaries");
945 		for (i = 1; i < 100; i++)
946 			tmps[i-1].cnext = &tmps[i];
947 		tmps[i-1].cnext = NULL;
948 	}
949 	x = tmps;
950 	tmps = x->cnext;
951 	*x = tempcell;
952 	return(x);
953 }
954 
indirect(Node ** a,int n)955 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
956 {
957 	Awkfloat val;
958 	Cell *x;
959 	int m;
960 	char *s;
961 
962 	x = execute(a[0]);
963 	val = getfval(x);	/* freebsd: defend against super large field numbers */
964 	if ((Awkfloat)INT_MAX < val)
965 		FATAL("trying to access out of range field %s", x->nval);
966 	m = (int) val;
967 	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
968 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
969 		/* BUG: can x->nval ever be null??? */
970 	tempfree(x);
971 	x = fieldadr(m);
972 	x->ctype = OCELL;	/* BUG?  why are these needed? */
973 	x->csub = CFLD;
974 	return(x);
975 }
976 
substr(Node ** a,int nnn)977 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
978 {
979 	int k, m, n;
980 	int mb, nb;
981 	char *s;
982 	int temp;
983 	Cell *x, *y, *z = NULL;
984 
985 	x = execute(a[0]);
986 	y = execute(a[1]);
987 	if (a[2] != NULL)
988 		z = execute(a[2]);
989 	s = getsval(x);
990 	k = u8_strlen(s) + 1;
991 	if (k <= 1) {
992 		tempfree(x);
993 		tempfree(y);
994 		if (a[2] != NULL) {
995 			tempfree(z);
996 		}
997 		x = gettemp();
998 		setsval(x, "");
999 		return(x);
1000 	}
1001 	m = (int) getfval(y);
1002 	if (m <= 0)
1003 		m = 1;
1004 	else if (m > k)
1005 		m = k;
1006 	tempfree(y);
1007 	if (a[2] != NULL) {
1008 		n = (int) getfval(z);
1009 		tempfree(z);
1010 	} else
1011 		n = k - 1;
1012 	if (n < 0)
1013 		n = 0;
1014 	else if (n > k - m)
1015 		n = k - m;
1016 	/* m is start, n is length from there */
1017 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1018 	y = gettemp();
1019 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1020 	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
1021 
1022 	temp = s[nb];	/* with thanks to John Linderman */
1023 	s[nb] = '\0';
1024 	setsval(y, s + mb);
1025 	s[nb] = temp;
1026 	tempfree(x);
1027 	return(y);
1028 }
1029 
sindex(Node ** a,int nnn)1030 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1031 {
1032 	Cell *x, *y, *z;
1033 	char *s1, *s2, *p1, *p2, *q;
1034 	Awkfloat v = 0.0;
1035 
1036 	x = execute(a[0]);
1037 	s1 = getsval(x);
1038 	y = execute(a[1]);
1039 	s2 = getsval(y);
1040 
1041 	z = gettemp();
1042 	for (p1 = s1; *p1 != '\0'; p1++) {
1043 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1044 			continue;
1045 		if (*p2 == '\0') {
1046 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1047 
1048 		   /* should be a function: used in match() as well */
1049 			int i, len;
1050 			v = 0;
1051 			for (i = 0; i < p1-s1+1; i += len) {
1052 				len = u8_nextlen(s1+i);
1053 				v++;
1054 			}
1055 			break;
1056 		}
1057 	}
1058 	tempfree(x);
1059 	tempfree(y);
1060 	setfval(z, v);
1061 	return(z);
1062 }
1063 
has_utf8(char * s)1064 int has_utf8(char *s)	/* return 1 if s contains any utf-8 (2 bytes or more) character */
1065 {
1066 	int n;
1067 
1068 	for (n = 0; *s != 0; s += n) {
1069 		n = u8_nextlen(s);
1070 		if (n > 1)
1071 			return 1;
1072 	}
1073 	return 0;
1074 }
1075 
1076 #define	MAXNUMSIZE	50
1077 
format(char ** pbuf,int * pbufsize,const char * s,Node * a)1078 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1079 {
1080 	char *fmt;
1081 	char *p, *t;
1082 	const char *os;
1083 	Cell *x;
1084 	int flag = 0, n;
1085 	int fmtwd; /* format width */
1086 	int fmtsz = recsize;
1087 	char *buf = *pbuf;
1088 	int bufsize = *pbufsize;
1089 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1090 #define BUFSZ(a)   (bufsize - ((a) - buf))
1091 
1092 	static bool first = true;
1093 	static bool have_a_format = false;
1094 
1095 	if (first) {
1096 		char xbuf[100];
1097 
1098 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1099 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1100 		first = false;
1101 	}
1102 
1103 	os = s;
1104 	p = buf;
1105 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1106 		FATAL("out of memory in format()");
1107 	while (*s) {
1108 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1109 		if (*s != '%') {
1110 			*p++ = *s++;
1111 			continue;
1112 		}
1113 		if (*(s+1) == '%') {
1114 			*p++ = '%';
1115 			s += 2;
1116 			continue;
1117 		}
1118 		fmtwd = atoi(s+1);
1119 		if (fmtwd < 0)
1120 			fmtwd = -fmtwd;
1121 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1122 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1123 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1124 				FATAL("format item %.30s... ran format() out of memory", os);
1125 			/* Ignore size specifiers */
1126 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1127 				t--;
1128 				continue;
1129 			}
1130 			if (isalpha((uschar)*s))
1131 				break;
1132 			if (*s == '$') {
1133 				FATAL("'$' not permitted in awk formats");
1134 			}
1135 			if (*s == '*') {
1136 				if (a == NULL) {
1137 					FATAL("not enough args in printf(%s)", os);
1138 				}
1139 				x = execute(a);
1140 				a = a->nnext;
1141 				snprintf(t - 1, FMTSZ(t - 1),
1142 				    "%d", fmtwd=(int) getfval(x));
1143 				if (fmtwd < 0)
1144 					fmtwd = -fmtwd;
1145 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1146 				t = fmt + strlen(fmt);
1147 				tempfree(x);
1148 			}
1149 		}
1150 		*t = '\0';
1151 		if (fmtwd < 0)
1152 			fmtwd = -fmtwd;
1153 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1154 		switch (*s) {
1155 		case 'a': case 'A':
1156 			if (have_a_format)
1157 				flag = *s;
1158 			else
1159 				flag = 'f';
1160 			break;
1161 		case 'f': case 'e': case 'g': case 'E': case 'G':
1162 			flag = 'f';
1163 			break;
1164 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1165 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1166 			*(t-1) = 'j';
1167 			*t = *s;
1168 			*++t = '\0';
1169 			break;
1170 		case 's':
1171 			flag = 's';
1172 			break;
1173 		case 'c':
1174 			flag = 'c';
1175 			break;
1176 		default:
1177 			WARNING("weird printf conversion %s", fmt);
1178 			flag = '?';
1179 			break;
1180 		}
1181 		if (a == NULL)
1182 			FATAL("not enough args in printf(%s)", os);
1183 		x = execute(a);
1184 		a = a->nnext;
1185 		n = MAXNUMSIZE;
1186 		if (fmtwd > n)
1187 			n = fmtwd;
1188 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1189 		switch (flag) {
1190 		case '?':
1191 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1192 			t = getsval(x);
1193 			n = strlen(t);
1194 			if (fmtwd > n)
1195 				n = fmtwd;
1196 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1197 			p += strlen(p);
1198 			snprintf(p, BUFSZ(p), "%s", t);
1199 			break;
1200 		case 'a':
1201 		case 'A':
1202 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1203 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1204 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1205 
1206 		case 's': {
1207 			t = getsval(x);
1208 			n = strlen(t);
1209 			/* if simple format or no utf-8 in the string, sprintf works */
1210 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1211 				if (fmtwd > n)
1212 					n = fmtwd;
1213 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1214 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1215 						" ran format() out of memory", n, t);
1216 				snprintf(p, BUFSZ(p), fmt, t);
1217 				break;
1218 			}
1219 
1220 			/* get here if string has utf-8 chars and fmt is not plain %s */
1221 			/* "%-w.ps", where -, w and .p are all optional */
1222 			/* '0' before the w is a flag character */
1223 			/* fmt points at % */
1224 			int ljust = 0, wid = 0, prec = n, pad = 0;
1225 			char *f = fmt+1;
1226 			if (f[0] == '-') {
1227 				ljust = 1;
1228 				f++;
1229 			}
1230 			// flags '0' and '+' are recognized but skipped
1231 			if (f[0] == '0') {
1232 				f++;
1233 				if (f[0] == '+')
1234 					f++;
1235 			}
1236 			if (f[0] == '+') {
1237 				f++;
1238 				if (f[0] == '0')
1239 					f++;
1240 			}
1241 			if (isdigit(f[0])) { /* there is a wid */
1242 				wid = strtol(f, &f, 10);
1243 			}
1244 			if (f[0] == '.') { /* there is a .prec */
1245 				prec = strtol(++f, &f, 10);
1246 			}
1247 			if (prec > u8_strlen(t))
1248 				prec = u8_strlen(t);
1249 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1250 			int i, k, n;
1251 
1252 			if (ljust) { // print prec chars from t, then pad blanks
1253 				n = u8_char2byte(t, prec);
1254 				for (k = 0; k < n; k++) {
1255 					//putchar(t[k]);
1256 					*p++ = t[k];
1257 				}
1258 				for (i = 0; i < pad; i++) {
1259 					//printf(" ");
1260 					*p++ = ' ';
1261 				}
1262 			} else { // print pad blanks, then prec chars from t
1263 				for (i = 0; i < pad; i++) {
1264 					//printf(" ");
1265 					*p++ = ' ';
1266 				}
1267 				n = u8_char2byte(t, prec);
1268 				for (k = 0; k < n; k++) {
1269 					//putchar(t[k]);
1270 					*p++ = t[k];
1271 				}
1272 			}
1273 			*p = 0;
1274 			break;
1275 		}
1276 
1277                case 'c': {
1278 			/*
1279 			 * If a numeric value is given, awk should just turn
1280 			 * it into a character and print it:
1281 			 *      BEGIN { printf("%c\n", 65) }
1282 			 * prints "A".
1283 			 *
1284 			 * But what if the numeric value is > 128 and
1285 			 * represents a valid Unicode code point?!? We do
1286 			 * our best to convert it back into UTF-8. If we
1287 			 * can't, we output the encoding of the Unicode
1288 			 * "invalid character", 0xFFFD.
1289 			 */
1290 			if (isnum(x)) {
1291 				int charval = (int) getfval(x);
1292 
1293 				if (charval != 0) {
1294 					if (charval < 128 || awk_mb_cur_max == 1)
1295 						snprintf(p, BUFSZ(p), fmt, charval);
1296 					else {
1297 						// possible unicode character
1298 						size_t count;
1299 						char *bs = wide_char_to_byte_str(charval, &count);
1300 
1301 						if (bs == NULL)	{ // invalid character
1302 							// use unicode invalid character, 0xFFFD
1303 							static char invalid_char[] = "\357\277\275";
1304 							bs = invalid_char;
1305 							count = 3;
1306 						}
1307 						t = bs;
1308 						n = count;
1309 						goto format_percent_c;
1310 					}
1311 				} else {
1312 					*p++ = '\0'; /* explicit null byte */
1313 					*p = '\0';   /* next output will start here */
1314 				}
1315 				break;
1316 			}
1317 			t = getsval(x);
1318 			n = u8_nextlen(t);
1319 		format_percent_c:
1320 			if (n < 2) { /* not utf8 */
1321 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1322 				break;
1323 			}
1324 
1325 			// utf8 character, almost same song and dance as for %s
1326 			int ljust = 0, wid = 0, prec = n, pad = 0;
1327 			char *f = fmt+1;
1328 			if (f[0] == '-') {
1329 				ljust = 1;
1330 				f++;
1331 			}
1332 			// flags '0' and '+' are recognized but skipped
1333 			if (f[0] == '0') {
1334 				f++;
1335 				if (f[0] == '+')
1336 					f++;
1337 			}
1338 			if (f[0] == '+') {
1339 				f++;
1340 				if (f[0] == '0')
1341 					f++;
1342 			}
1343 			if (isdigit(f[0])) { /* there is a wid */
1344 				wid = strtol(f, &f, 10);
1345 			}
1346 			if (f[0] == '.') { /* there is a .prec */
1347 				prec = strtol(++f, &f, 10);
1348 			}
1349 			if (prec > 1)           // %c --> only one character
1350 				prec = 1;
1351 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1352 			int i;
1353 
1354 			if (ljust) { // print one char from t, then pad blanks
1355 				for (i = 0; i < n; i++)
1356 					*p++ = t[i];
1357 				for (i = 0; i < pad; i++) {
1358 					//printf(" ");
1359 					*p++ = ' ';
1360 				}
1361 			} else { // print pad blanks, then prec chars from t
1362 				for (i = 0; i < pad; i++) {
1363 					//printf(" ");
1364 					*p++ = ' ';
1365 				}
1366 				for (i = 0; i < n; i++)
1367 					*p++ = t[i];
1368 			}
1369 			*p = 0;
1370 			break;
1371 		}
1372 		default:
1373 			FATAL("can't happen: bad conversion %c in format()", flag);
1374 		}
1375 
1376 		tempfree(x);
1377 		p += strlen(p);
1378 		s++;
1379 	}
1380 	*p = '\0';
1381 	free(fmt);
1382 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1383 		x = execute(a);
1384 		tempfree(x);
1385 	}
1386 	*pbuf = buf;
1387 	*pbufsize = bufsize;
1388 	return p - buf;
1389 }
1390 
awksprintf(Node ** a,int n)1391 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1392 {
1393 	Cell *x;
1394 	Node *y;
1395 	char *buf;
1396 	int bufsz=3*recsize;
1397 
1398 	if ((buf = (char *) malloc(bufsz)) == NULL)
1399 		FATAL("out of memory in awksprintf");
1400 	y = a[0]->nnext;
1401 	x = execute(a[0]);
1402 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1403 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1404 	tempfree(x);
1405 	x = gettemp();
1406 	x->sval = buf;
1407 	x->tval = STR;
1408 	return(x);
1409 }
1410 
awkprintf(Node ** a,int n)1411 Cell *awkprintf(Node **a, int n)		/* printf */
1412 {	/* a[0] is list of args, starting with format string */
1413 	/* a[1] is redirection operator, a[2] is redirection file */
1414 	FILE *fp;
1415 	Cell *x;
1416 	Node *y;
1417 	char *buf;
1418 	int len;
1419 	int bufsz=3*recsize;
1420 
1421 	if ((buf = (char *) malloc(bufsz)) == NULL)
1422 		FATAL("out of memory in awkprintf");
1423 	y = a[0]->nnext;
1424 	x = execute(a[0]);
1425 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1426 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1427 	tempfree(x);
1428 	if (a[1] == NULL) {
1429 		/* fputs(buf, stdout); */
1430 		fwrite(buf, len, 1, stdout);
1431 		if (ferror(stdout))
1432 			FATAL("write error on stdout");
1433 	} else {
1434 		fp = redirect(ptoi(a[1]), a[2]);
1435 		/* fputs(buf, fp); */
1436 		fwrite(buf, len, 1, fp);
1437 		fflush(fp);
1438 		if (ferror(fp))
1439 			FATAL("write error on %s", filename(fp));
1440 	}
1441 	free(buf);
1442 	return(True);
1443 }
1444 
arith(Node ** a,int n)1445 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1446 {
1447 	Awkfloat i, j = 0;
1448 	double v;
1449 	Cell *x, *y, *z;
1450 
1451 	x = execute(a[0]);
1452 	i = getfval(x);
1453 	tempfree(x);
1454 	if (n != UMINUS && n != UPLUS) {
1455 		y = execute(a[1]);
1456 		j = getfval(y);
1457 		tempfree(y);
1458 	}
1459 	z = gettemp();
1460 	switch (n) {
1461 	case ADD:
1462 		i += j;
1463 		break;
1464 	case MINUS:
1465 		i -= j;
1466 		break;
1467 	case MULT:
1468 		i *= j;
1469 		break;
1470 	case DIVIDE:
1471 		if (j == 0)
1472 			FATAL("division by zero");
1473 		i /= j;
1474 		break;
1475 	case MOD:
1476 		if (j == 0)
1477 			FATAL("division by zero in mod");
1478 		modf(i/j, &v);
1479 		i = i - j * v;
1480 		break;
1481 	case UMINUS:
1482 		i = -i;
1483 		break;
1484 	case UPLUS: /* handled by getfval(), above */
1485 		break;
1486 	case POWER:
1487 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1488 			i = ipow(i, (int) j);
1489                else {
1490 			errno = 0;
1491 			i = errcheck(pow(i, j), "pow");
1492                }
1493 		break;
1494 	default:	/* can't happen */
1495 		FATAL("illegal arithmetic operator %d", n);
1496 	}
1497 	setfval(z, i);
1498 	return(z);
1499 }
1500 
ipow(double x,int n)1501 double ipow(double x, int n)	/* x**n.  ought to be done by pow, but isn't always */
1502 {
1503 	double v;
1504 
1505 	if (n <= 0)
1506 		return 1;
1507 	v = ipow(x, n/2);
1508 	if (n % 2 == 0)
1509 		return v * v;
1510 	else
1511 		return x * v * v;
1512 }
1513 
incrdecr(Node ** a,int n)1514 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1515 {
1516 	Cell *x, *z;
1517 	int k;
1518 	Awkfloat xf;
1519 
1520 	x = execute(a[0]);
1521 	xf = getfval(x);
1522 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1523 	if (n == PREINCR || n == PREDECR) {
1524 		setfval(x, xf + k);
1525 		return(x);
1526 	}
1527 	z = gettemp();
1528 	setfval(z, xf);
1529 	setfval(x, xf + k);
1530 	tempfree(x);
1531 	return(z);
1532 }
1533 
assign(Node ** a,int n)1534 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1535 {		/* this is subtle; don't muck with it. */
1536 	Cell *x, *y;
1537 	Awkfloat xf, yf;
1538 	double v;
1539 
1540 	y = execute(a[1]);
1541 	x = execute(a[0]);
1542 	if (n == ASSIGN) {	/* ordinary assignment */
1543 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1544 			;	/* self-assignment: leave alone unless it's a field or NF */
1545 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
1546 			yf = getfval(y);
1547 			setsval(x, getsval(y));
1548 			x->fval = yf;
1549 			x->tval |= NUM;
1550 		}
1551 		else if (isstr(y))
1552 			setsval(x, getsval(y));
1553 		else if (isnum(y))
1554 			setfval(x, getfval(y));
1555 		else
1556 			funnyvar(y, "read value of");
1557 		tempfree(y);
1558 		return(x);
1559 	}
1560 	xf = getfval(x);
1561 	yf = getfval(y);
1562 	switch (n) {
1563 	case ADDEQ:
1564 		xf += yf;
1565 		break;
1566 	case SUBEQ:
1567 		xf -= yf;
1568 		break;
1569 	case MULTEQ:
1570 		xf *= yf;
1571 		break;
1572 	case DIVEQ:
1573 		if (yf == 0)
1574 			FATAL("division by zero in /=");
1575 		xf /= yf;
1576 		break;
1577 	case MODEQ:
1578 		if (yf == 0)
1579 			FATAL("division by zero in %%=");
1580 		modf(xf/yf, &v);
1581 		xf = xf - yf * v;
1582 		break;
1583 	case POWEQ:
1584 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1585 			xf = ipow(xf, (int) yf);
1586                else {
1587 			errno = 0;
1588 			xf = errcheck(pow(xf, yf), "pow");
1589                }
1590 		break;
1591 	default:
1592 		FATAL("illegal assignment operator %d", n);
1593 		break;
1594 	}
1595 	tempfree(y);
1596 	setfval(x, xf);
1597 	return(x);
1598 }
1599 
cat(Node ** a,int q)1600 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1601 {
1602 	Cell *x, *y, *z;
1603 	int n1, n2;
1604 	char *s = NULL;
1605 	int ssz = 0;
1606 
1607 	x = execute(a[0]);
1608 	n1 = strlen(getsval(x));
1609 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1610 	memcpy(s, x->sval, n1);
1611 
1612 	tempfree(x);
1613 
1614 	y = execute(a[1]);
1615 	n2 = strlen(getsval(y));
1616 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1617 	memcpy(s + n1, y->sval, n2);
1618 	s[n1 + n2] = '\0';
1619 
1620 	tempfree(y);
1621 
1622 	z = gettemp();
1623 	z->sval = s;
1624 	z->tval = STR;
1625 
1626 	return(z);
1627 }
1628 
pastat(Node ** a,int n)1629 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1630 {
1631 	Cell *x;
1632 
1633 	if (a[0] == NULL)
1634 		x = execute(a[1]);
1635 	else {
1636 		x = execute(a[0]);
1637 		if (istrue(x)) {
1638 			tempfree(x);
1639 			x = execute(a[1]);
1640 		}
1641 	}
1642 	return x;
1643 }
1644 
dopa2(Node ** a,int n)1645 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1646 {
1647 	Cell *x;
1648 	int pair;
1649 
1650 	pair = ptoi(a[3]);
1651 	if (pairstack[pair] == 0) {
1652 		x = execute(a[0]);
1653 		if (istrue(x))
1654 			pairstack[pair] = 1;
1655 		tempfree(x);
1656 	}
1657 	if (pairstack[pair] == 1) {
1658 		x = execute(a[1]);
1659 		if (istrue(x))
1660 			pairstack[pair] = 0;
1661 		tempfree(x);
1662 		x = execute(a[2]);
1663 		return(x);
1664 	}
1665 	return(False);
1666 }
1667 
split(Node ** a,int nnn)1668 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1669 {
1670 	Cell *x = NULL, *y, *ap;
1671 	const char *s, *origs, *t;
1672 	const char *fs = NULL;
1673 	char *origfs = NULL;
1674 	int sep;
1675 	char temp, num[50];
1676 	int n, tempstat, arg3type;
1677 	int j;
1678 	double result;
1679 
1680 	y = execute(a[0]);	/* source string */
1681 	origs = s = strdup(getsval(y));
1682 	tempfree(y);
1683 	arg3type = ptoi(a[3]);
1684 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1685 		fs = getsval(fsloc);
1686 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1687 		x = execute(a[2]);
1688 		fs = origfs = strdup(getsval(x));
1689 		tempfree(x);
1690 	} else if (arg3type == REGEXPR) {
1691 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1692 	} else {
1693 		FATAL("illegal type of split");
1694 	}
1695 	sep = *fs;
1696 	ap = execute(a[1]);	/* array name */
1697 /* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1698 	freesymtab(ap);
1699 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1700 	ap->tval &= ~STR;
1701 	ap->tval |= ARR;
1702 	ap->sval = (char *) makesymtab(NSYMTAB);
1703 
1704 	n = 0;
1705         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1706 		/* split(s, a, //); have to arrange that it looks like empty sep */
1707 		arg3type = 0;
1708 		fs = "";
1709 		sep = 0;
1710 	}
1711 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1712 		fa *pfa;
1713 		if (arg3type == REGEXPR) {	/* it's ready already */
1714 			pfa = (fa *) a[2];
1715 		} else {
1716 			pfa = makedfa(fs, 1);
1717 		}
1718 		if (nematch(pfa,s)) {
1719 			tempstat = pfa->initstat;
1720 			pfa->initstat = 2;
1721 			do {
1722 				n++;
1723 				snprintf(num, sizeof(num), "%d", n);
1724 				temp = *patbeg;
1725 				setptr(patbeg, '\0');
1726 				if (is_number(s, & result))
1727 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1728 				else
1729 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1730 				setptr(patbeg, temp);
1731 				s = patbeg + patlen;
1732 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1733 					n++;
1734 					snprintf(num, sizeof(num), "%d", n);
1735 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1736 					pfa->initstat = tempstat;
1737 					goto spdone;
1738 				}
1739 			} while (nematch(pfa,s));
1740 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1741 							/* cf gsub and refldbld */
1742 		}
1743 		n++;
1744 		snprintf(num, sizeof(num), "%d", n);
1745 		if (is_number(s, & result))
1746 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1747 		else
1748 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1749   spdone:
1750 		pfa = NULL;
1751 
1752 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1753 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1754 		for (;;) {
1755 			char *fr = newt;
1756 			n++;
1757 			if (*s == '"' ) { /* start of "..." */
1758 				for (s++ ; *s != '\0'; ) {
1759 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1760 						s += 2; /* doubled quote */
1761 						*fr++ = '"';
1762 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1763 						s++; /* skip over closing quote */
1764 						break;
1765 					} else {
1766 						*fr++ = *s++;
1767 					}
1768 				}
1769 				*fr++ = 0;
1770 			} else {	/* unquoted field */
1771 				while (*s != ',' && *s != '\0')
1772 					*fr++ = *s++;
1773 				*fr++ = 0;
1774 			}
1775 			snprintf(num, sizeof(num), "%d", n);
1776 			if (is_number(newt, &result))
1777 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1778 			else
1779 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1780 			if (*s++ == '\0')
1781 				break;
1782 		}
1783 		free(newt);
1784 
1785 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1786 		for (n = 0; ; ) {
1787 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1788 			while (ISWS(*s))
1789 				s++;
1790 			if (*s == '\0')
1791 				break;
1792 			n++;
1793 			t = s;
1794 			do
1795 				s++;
1796 			while (*s != '\0' && !ISWS(*s));
1797 			temp = *s;
1798 			setptr(s, '\0');
1799 			snprintf(num, sizeof(num), "%d", n);
1800 			if (is_number(t, & result))
1801 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1802 			else
1803 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1804 			setptr(s, temp);
1805 			if (*s != '\0')
1806 				s++;
1807 		}
1808 
1809 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1810 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1811 			char buf[10];
1812 			n++;
1813 			snprintf(num, sizeof(num), "%d", n);
1814 
1815 			for (j = 0; j < u8_nextlen(s); j++) {
1816 				buf[j] = s[j];
1817 			}
1818 			buf[j] = '\0';
1819 
1820 			if (isdigit((uschar)buf[0]))
1821 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1822 			else
1823 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1824 		}
1825 
1826 	} else if (*s != '\0') {  /* some random single character */
1827 		for (;;) {
1828 			n++;
1829 			t = s;
1830 			while (*s != sep && *s != '\0')
1831 				s++;
1832 			temp = *s;
1833 			setptr(s, '\0');
1834 			snprintf(num, sizeof(num), "%d", n);
1835 			if (is_number(t, & result))
1836 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1837 			else
1838 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1839 			setptr(s, temp);
1840 			if (*s++ == '\0')
1841 				break;
1842 		}
1843 	}
1844 	tempfree(ap);
1845 	xfree(origs);
1846 	xfree(origfs);
1847 	x = gettemp();
1848 	x->tval = NUM;
1849 	x->fval = n;
1850 	return(x);
1851 }
1852 
condexpr(Node ** a,int n)1853 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1854 {
1855 	Cell *x;
1856 
1857 	x = execute(a[0]);
1858 	if (istrue(x)) {
1859 		tempfree(x);
1860 		x = execute(a[1]);
1861 	} else {
1862 		tempfree(x);
1863 		x = execute(a[2]);
1864 	}
1865 	return(x);
1866 }
1867 
ifstat(Node ** a,int n)1868 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1869 {
1870 	Cell *x;
1871 
1872 	x = execute(a[0]);
1873 	if (istrue(x)) {
1874 		tempfree(x);
1875 		x = execute(a[1]);
1876 	} else if (a[2] != NULL) {
1877 		tempfree(x);
1878 		x = execute(a[2]);
1879 	}
1880 	return(x);
1881 }
1882 
whilestat(Node ** a,int n)1883 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1884 {
1885 	Cell *x;
1886 
1887 	for (;;) {
1888 		x = execute(a[0]);
1889 		if (!istrue(x))
1890 			return(x);
1891 		tempfree(x);
1892 		x = execute(a[1]);
1893 		if (isbreak(x)) {
1894 			x = True;
1895 			return(x);
1896 		}
1897 		if (isnext(x) || isexit(x) || isret(x))
1898 			return(x);
1899 		tempfree(x);
1900 	}
1901 }
1902 
dostat(Node ** a,int n)1903 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1904 {
1905 	Cell *x;
1906 
1907 	for (;;) {
1908 		x = execute(a[0]);
1909 		if (isbreak(x))
1910 			return True;
1911 		if (isnext(x) || isexit(x) || isret(x))
1912 			return(x);
1913 		tempfree(x);
1914 		x = execute(a[1]);
1915 		if (!istrue(x))
1916 			return(x);
1917 		tempfree(x);
1918 	}
1919 }
1920 
forstat(Node ** a,int n)1921 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1922 {
1923 	Cell *x;
1924 
1925 	x = execute(a[0]);
1926 	tempfree(x);
1927 	for (;;) {
1928 		if (a[1]!=NULL) {
1929 			x = execute(a[1]);
1930 			if (!istrue(x)) return(x);
1931 			else tempfree(x);
1932 		}
1933 		x = execute(a[3]);
1934 		if (isbreak(x))		/* turn off break */
1935 			return True;
1936 		if (isnext(x) || isexit(x) || isret(x))
1937 			return(x);
1938 		tempfree(x);
1939 		x = execute(a[2]);
1940 		tempfree(x);
1941 	}
1942 }
1943 
instat(Node ** a,int n)1944 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1945 {
1946 	Cell *x, *vp, *arrayp, *cp, *ncp;
1947 	Array *tp;
1948 	int i;
1949 
1950 	vp = execute(a[0]);
1951 	arrayp = execute(a[1]);
1952 	if (!isarr(arrayp)) {
1953 		return True;
1954 	}
1955 	tp = (Array *) arrayp->sval;
1956 	tempfree(arrayp);
1957 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1958 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1959 			setsval(vp, cp->nval);
1960 			ncp = cp->cnext;
1961 			x = execute(a[2]);
1962 			if (isbreak(x)) {
1963 				tempfree(vp);
1964 				return True;
1965 			}
1966 			if (isnext(x) || isexit(x) || isret(x)) {
1967 				tempfree(vp);
1968 				return(x);
1969 			}
1970 			tempfree(x);
1971 		}
1972 	}
1973 	return True;
1974 }
1975 
nawk_convert(const char * s,int (* fun_c)(int),wint_t (* fun_wc)(wint_t))1976 static char *nawk_convert(const char *s, int (*fun_c)(int),
1977     wint_t (*fun_wc)(wint_t))
1978 {
1979 	char *buf      = NULL;
1980 	char *pbuf     = NULL;
1981 	const char *ps = NULL;
1982 	size_t n       = 0;
1983 	wchar_t wc;
1984 	const size_t sz = awk_mb_cur_max;
1985 	int unused;
1986 
1987 	if (sz == 1) {
1988 		buf = tostring(s);
1989 
1990 		for (pbuf = buf; *pbuf; pbuf++)
1991 			*pbuf = fun_c((uschar)*pbuf);
1992 
1993 		return buf;
1994 	} else {
1995 		/* upper/lower character may be shorter/longer */
1996 		buf = tostringN(s, strlen(s) * sz + 1);
1997 
1998 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
1999 		/*
2000 		 * Reset internal state here too.
2001 		 * Assign result to avoid a compiler warning. (Casting to void
2002 		 * doesn't work.)
2003 		 * Increment said variable to avoid a different warning.
2004 		 */
2005 		unused = wctomb(NULL, L'\0');
2006 		unused++;
2007 
2008 		ps   = s;
2009 		pbuf = buf;
2010 		while (n = mbtowc(&wc, ps, sz),
2011 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2012 		{
2013 			ps += n;
2014 
2015 			n = wctomb(pbuf, fun_wc(wc));
2016 			if (n == (size_t)-1)
2017 				FATAL("illegal wide character %s", s);
2018 
2019 			pbuf += n;
2020 		}
2021 
2022 		*pbuf = '\0';
2023 
2024 		if (n)
2025 			FATAL("illegal byte sequence %s", s);
2026 
2027 		return buf;
2028 	}
2029 }
2030 
2031 #ifdef __DJGPP__
towupper(wint_t wc)2032 static wint_t towupper(wint_t wc)
2033 {
2034 	if (wc >= 0 && wc < 256)
2035 		return toupper(wc & 0xFF);
2036 
2037 	return wc;
2038 }
2039 
towlower(wint_t wc)2040 static wint_t towlower(wint_t wc)
2041 {
2042 	if (wc >= 0 && wc < 256)
2043 		return tolower(wc & 0xFF);
2044 
2045 	return wc;
2046 }
2047 #endif
2048 
nawk_toupper(const char * s)2049 static char *nawk_toupper(const char *s)
2050 {
2051 	return nawk_convert(s, toupper, towupper);
2052 }
2053 
nawk_tolower(const char * s)2054 static char *nawk_tolower(const char *s)
2055 {
2056 	return nawk_convert(s, tolower, towlower);
2057 }
2058 
2059 
2060 
bltin(Node ** a,int n)2061 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2062 {
2063 	Cell *x, *y;
2064 	Awkfloat u = 0;
2065 	int t, sz;
2066 	Awkfloat tmp;
2067 	char *buf, *fmt;
2068 	Node *nextarg;
2069 	FILE *fp;
2070 	int status = 0;
2071 	time_t tv;
2072 	struct tm *tm, tmbuf;
2073 	int estatus = 0;
2074 
2075 	t = ptoi(a[0]);
2076 	x = execute(a[1]);
2077 	nextarg = a[1]->nnext;
2078 	switch (t) {
2079 	case FLENGTH:
2080 		if (isarr(x))
2081 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2082 		else
2083 			u = u8_strlen(getsval(x));
2084 		break;
2085 	case FLOG:
2086 		errno = 0;
2087 		u = errcheck(log(getfval(x)), "log");
2088 		break;
2089 	case FINT:
2090 		modf(getfval(x), &u); break;
2091 	case FEXP:
2092 		errno = 0;
2093 		u = errcheck(exp(getfval(x)), "exp");
2094 		break;
2095 	case FSQRT:
2096 		errno = 0;
2097 		u = errcheck(sqrt(getfval(x)), "sqrt");
2098 		break;
2099 	case FSIN:
2100 		u = sin(getfval(x)); break;
2101 	case FCOS:
2102 		u = cos(getfval(x)); break;
2103 	case FATAN:
2104 		if (nextarg == NULL) {
2105 			WARNING("atan2 requires two arguments; returning 1.0");
2106 			u = 1.0;
2107 		} else {
2108 			y = execute(a[1]->nnext);
2109 			u = atan2(getfval(x), getfval(y));
2110 			tempfree(y);
2111 			nextarg = nextarg->nnext;
2112 		}
2113 		break;
2114 	case FCOMPL:
2115 		u = ~((int)getfval(x));
2116 		break;
2117 	case FAND:
2118 		if (nextarg == 0) {
2119 			WARNING("and requires two arguments; returning 0");
2120 			u = 0;
2121 			break;
2122 		}
2123 		y = execute(a[1]->nnext);
2124 		u = ((int)getfval(x)) & ((int)getfval(y));
2125 		tempfree(y);
2126 		nextarg = nextarg->nnext;
2127 		break;
2128 	case FFOR:
2129 		if (nextarg == 0) {
2130 			WARNING("or requires two arguments; returning 0");
2131 			u = 0;
2132 			break;
2133 		}
2134 		y = execute(a[1]->nnext);
2135 		u = ((int)getfval(x)) | ((int)getfval(y));
2136 		tempfree(y);
2137 		nextarg = nextarg->nnext;
2138 		break;
2139 	case FXOR:
2140 		if (nextarg == 0) {
2141 			WARNING("xor requires two arguments; returning 0");
2142 			u = 0;
2143 			break;
2144 		}
2145 		y = execute(a[1]->nnext);
2146 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2147 		tempfree(y);
2148 		nextarg = nextarg->nnext;
2149 		break;
2150 	case FLSHIFT:
2151 		if (nextarg == 0) {
2152 			WARNING("lshift requires two arguments; returning 0");
2153 			u = 0;
2154 			break;
2155 		}
2156 		y = execute(a[1]->nnext);
2157 		u = ((int)getfval(x)) << ((int)getfval(y));
2158 		tempfree(y);
2159 		nextarg = nextarg->nnext;
2160 		break;
2161 	case FRSHIFT:
2162 		if (nextarg == 0) {
2163 			WARNING("rshift requires two arguments; returning 0");
2164 			u = 0;
2165 			break;
2166 		}
2167 		y = execute(a[1]->nnext);
2168 		u = ((int)getfval(x)) >> ((int)getfval(y));
2169 		tempfree(y);
2170 		nextarg = nextarg->nnext;
2171 		break;
2172 	case FSYSTEM:
2173 		fflush(stdout);		/* in case something is buffered already */
2174 		estatus = status = system(getsval(x));
2175 		if (status != -1) {
2176 			if (WIFEXITED(status)) {
2177 				estatus = WEXITSTATUS(status);
2178 			} else if (WIFSIGNALED(status)) {
2179 				estatus = WTERMSIG(status) + 256;
2180 #ifdef WCOREDUMP
2181 				if (WCOREDUMP(status))
2182 					estatus += 256;
2183 #endif
2184 			} else	/* something else?!? */
2185 				estatus = 0;
2186 		}
2187 		/* else estatus was set to -1 */
2188 		u = estatus;
2189 		break;
2190 	case FRAND:
2191 		/* random() returns numbers in [0..2^31-1]
2192 		 * in order to get a number in [0, 1), divide it by 2^31
2193 		 */
2194 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2195 		break;
2196 	case FSRAND:
2197 		if (isrec(x))	/* no argument provided */
2198 			u = time((time_t *)0);
2199 		else
2200 			u = getfval(x);
2201 		tmp = u;
2202 		srandom((unsigned long) u);
2203 		u = srand_seed;
2204 		srand_seed = tmp;
2205 		break;
2206 	case FTOUPPER:
2207 	case FTOLOWER:
2208 		if (t == FTOUPPER)
2209 			buf = nawk_toupper(getsval(x));
2210 		else
2211 			buf = nawk_tolower(getsval(x));
2212 		tempfree(x);
2213 		x = gettemp();
2214 		setsval(x, buf);
2215 		free(buf);
2216 		return x;
2217 	case FFLUSH:
2218 		if (isrec(x) || strlen(getsval(x)) == 0) {
2219 			flush_all();	/* fflush() or fflush("") -> all */
2220 			u = 0;
2221 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2222 			u = EOF;
2223 		else
2224 			u = fflush(fp);
2225 		break;
2226 	case FMKTIME:
2227 		memset(&tmbuf, 0, sizeof(tmbuf));
2228 		tm = &tmbuf;
2229 		t = sscanf(getsval(x), "%d %d %d %d %d %d %d",
2230 		    &tm->tm_year, &tm->tm_mon, &tm->tm_mday, &tm->tm_hour,
2231 		    &tm->tm_min, &tm->tm_sec, &tm->tm_isdst);
2232 		switch (t) {
2233 		case 6:
2234 			tm->tm_isdst = -1;	/* let mktime figure it out */
2235 			/* FALLTHROUGH */
2236 		case 7:
2237 			tm->tm_year -= 1900;
2238 			tm->tm_mon--;
2239 			u = mktime(tm);
2240 			break;
2241 		default:
2242 			u = -1;
2243 			break;
2244 		}
2245 		break;
2246 	case FSYSTIME:
2247 		u = time((time_t *) 0);
2248 		break;
2249 	case FSTRFTIME:
2250 		/* strftime([format [,timestamp]]) */
2251 		if (nextarg) {
2252 			y = execute(nextarg);
2253 			nextarg = nextarg->nnext;
2254 			tv = (time_t) getfval(y);
2255 			tempfree(y);
2256 		} else
2257 			tv = time((time_t *) 0);
2258 		tm = localtime(&tv);
2259 		if (tm == NULL)
2260 			FATAL("bad time %ld", (long)tv);
2261 
2262 		if (isrec(x)) {
2263 			/* format argument not provided, use default */
2264 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2265 		} else
2266 			fmt = tostring(getsval(x));
2267 
2268 		sz = 32;
2269 		buf = NULL;
2270 		do {
2271 			if ((buf = realloc(buf, (sz *= 2))) == NULL)
2272 				FATAL("out of memory in strftime");
2273 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2274 
2275 		y = gettemp();
2276 		setsval(y, buf);
2277 		free(fmt);
2278 		free(buf);
2279 
2280 		return y;
2281 	default:	/* can't happen */
2282 		FATAL("illegal function type %d", t);
2283 		break;
2284 	}
2285 	tempfree(x);
2286 	x = gettemp();
2287 	setfval(x, u);
2288 	if (nextarg != NULL) {
2289 		WARNING("warning: function has too many arguments");
2290 		for ( ; nextarg; nextarg = nextarg->nnext) {
2291 			y = execute(nextarg);
2292 			tempfree(y);
2293 		}
2294 	}
2295 	return(x);
2296 }
2297 
printstat(Node ** a,int n)2298 Cell *printstat(Node **a, int n)	/* print a[0] */
2299 {
2300 	Node *x;
2301 	Cell *y;
2302 	FILE *fp;
2303 
2304 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2305 		fp = stdout;
2306 	else
2307 		fp = redirect(ptoi(a[1]), a[2]);
2308 	for (x = a[0]; x != NULL; x = x->nnext) {
2309 		y = execute(x);
2310 		fputs(getpssval(y), fp);
2311 		tempfree(y);
2312 		if (x->nnext == NULL)
2313 			fputs(getsval(orsloc), fp);
2314 		else
2315 			fputs(getsval(ofsloc), fp);
2316 	}
2317 	if (a[1] != NULL)
2318 		fflush(fp);
2319 	if (ferror(fp))
2320 		FATAL("write error on %s", filename(fp));
2321 	return(True);
2322 }
2323 
nullproc(Node ** a,int n)2324 Cell *nullproc(Node **a, int n)
2325 {
2326 	return 0;
2327 }
2328 
2329 
redirect(int a,Node * b)2330 FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
2331 {
2332 	FILE *fp;
2333 	Cell *x;
2334 	char *fname;
2335 
2336 	x = execute(b);
2337 	fname = getsval(x);
2338 	fp = openfile(a, fname, NULL);
2339 	if (fp == NULL)
2340 		FATAL("can't open file %s", fname);
2341 	tempfree(x);
2342 	return fp;
2343 }
2344 
2345 struct files {
2346 	FILE	*fp;
2347 	const char	*fname;
2348 	int	mode;	/* '|', 'a', 'w' => LE/LT, GT */
2349 } *files;
2350 
2351 size_t nfiles;
2352 
stdinit(void)2353 static void stdinit(void)	/* in case stdin, etc., are not constants */
2354 {
2355 	nfiles = FOPEN_MAX;
2356 	files = (struct files *) calloc(nfiles, sizeof(*files));
2357 	if (files == NULL)
2358 		FATAL("can't allocate file memory for %zu files", nfiles);
2359         files[0].fp = stdin;
2360 	files[0].fname = tostring("/dev/stdin");
2361 	files[0].mode = LT;
2362         files[1].fp = stdout;
2363 	files[1].fname = tostring("/dev/stdout");
2364 	files[1].mode = GT;
2365         files[2].fp = stderr;
2366 	files[2].fname = tostring("/dev/stderr");
2367 	files[2].mode = GT;
2368 }
2369 
openfile(int a,const char * us,bool * pnewflag)2370 FILE *openfile(int a, const char *us, bool *pnewflag)
2371 {
2372 	const char *s = us;
2373 	size_t i;
2374 	int m;
2375 	FILE *fp = NULL;
2376 
2377 	if (*s == '\0')
2378 		FATAL("null file name in print or getline");
2379 	for (i = 0; i < nfiles; i++)
2380 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2381 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2382 		     a == FFLUSH)) {
2383 			if (pnewflag)
2384 				*pnewflag = false;
2385 			return files[i].fp;
2386 		}
2387 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2388 		return NULL;
2389 
2390 	for (i = 0; i < nfiles; i++)
2391 		if (files[i].fp == NULL)
2392 			break;
2393 	if (i >= nfiles) {
2394 		struct files *nf;
2395 		size_t nnf = nfiles + FOPEN_MAX;
2396 		nf = (struct files *) realloc(files, nnf * sizeof(*nf));
2397 		if (nf == NULL)
2398 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2399 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2400 		nfiles = nnf;
2401 		files = nf;
2402 	}
2403 	fflush(stdout);	/* force a semblance of order */
2404 	m = a;
2405 	if (a == GT) {
2406 		fp = fopen(s, "w");
2407 	} else if (a == APPEND) {
2408 		fp = fopen(s, "a");
2409 		m = GT;	/* so can mix > and >> */
2410 	} else if (a == '|') {	/* output pipe */
2411 		fp = popen(s, "w");
2412 	} else if (a == LE) {	/* input pipe */
2413 		fp = popen(s, "r");
2414 	} else if (a == LT) {	/* getline <file */
2415 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2416 	} else	/* can't happen */
2417 		FATAL("illegal redirection %d", a);
2418 	if (fp != NULL) {
2419 		files[i].fname = tostring(s);
2420 		files[i].fp = fp;
2421 		files[i].mode = m;
2422 		if (pnewflag)
2423 			*pnewflag = true;
2424 		if (fp != stdin && fp != stdout && fp != stderr)
2425 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2426 	}
2427 	return fp;
2428 }
2429 
filename(FILE * fp)2430 const char *filename(FILE *fp)
2431 {
2432 	size_t i;
2433 
2434 	for (i = 0; i < nfiles; i++)
2435 		if (fp == files[i].fp)
2436 			return files[i].fname;
2437 	return "???";
2438 }
2439 
closefile(Node ** a,int n)2440 Cell *closefile(Node **a, int n)
2441 {
2442  	Cell *x;
2443 	size_t i;
2444 	bool stat;
2445 
2446  	x = execute(a[0]);
2447  	getsval(x);
2448 	stat = true;
2449  	for (i = 0; i < nfiles; i++) {
2450 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2451 			continue;
2452 		if (files[i].mode == GT || files[i].mode == '|')
2453 			fflush(files[i].fp);
2454 		if (ferror(files[i].fp)) {
2455 			if ((files[i].mode == GT && files[i].fp != stderr)
2456 			  || files[i].mode == '|')
2457 				FATAL("write error on %s", files[i].fname);
2458 			else
2459 				WARNING("i/o error occurred on %s", files[i].fname);
2460 		}
2461 		if (files[i].fp == stdin || files[i].fp == stdout ||
2462 		    files[i].fp == stderr)
2463 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2464 		else if (files[i].mode == '|' || files[i].mode == LE)
2465 			stat = pclose(files[i].fp) == -1;
2466 		else
2467 			stat = fclose(files[i].fp) == EOF;
2468 		if (stat)
2469 			WARNING("i/o error occurred closing %s", files[i].fname);
2470 		xfree(files[i].fname);
2471 		files[i].fname = NULL;	/* watch out for ref thru this */
2472 		files[i].fp = NULL;
2473 		break;
2474  	}
2475  	tempfree(x);
2476  	x = gettemp();
2477 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2478  	return(x);
2479 }
2480 
closeall(void)2481 void closeall(void)
2482 {
2483 	size_t i;
2484 	bool stat = false;
2485 
2486 	for (i = 0; i < nfiles; i++) {
2487 		if (! files[i].fp)
2488 			continue;
2489 		if (files[i].mode == GT || files[i].mode == '|')
2490 			fflush(files[i].fp);
2491 		if (ferror(files[i].fp)) {
2492 			if ((files[i].mode == GT && files[i].fp != stderr)
2493 			  || files[i].mode == '|')
2494 				FATAL("write error on %s", files[i].fname);
2495 			else
2496 				WARNING("i/o error occurred on %s", files[i].fname);
2497 		}
2498 		if (files[i].fp == stdin || files[i].fp == stdout ||
2499 		    files[i].fp == stderr)
2500 			continue;
2501 		if (files[i].mode == '|' || files[i].mode == LE)
2502 			stat = pclose(files[i].fp) == -1;
2503 		else
2504 			stat = fclose(files[i].fp) == EOF;
2505 		if (stat)
2506 			WARNING("i/o error occurred while closing %s", files[i].fname);
2507 	}
2508 }
2509 
flush_all(void)2510 static void flush_all(void)
2511 {
2512 	size_t i;
2513 
2514 	for (i = 0; i < nfiles; i++)
2515 		if (files[i].fp)
2516 			fflush(files[i].fp);
2517 }
2518 
2519 void backsub(char **pb_ptr, const char **sptr_ptr);
2520 
dosub(Node ** a,int subop)2521 Cell *dosub(Node **a, int subop)        /* sub and gsub */
2522 {
2523 	fa *pfa;
2524 	int tempstat = 0;
2525 	char *repl;
2526 	Cell *x;
2527 
2528 	char *buf = NULL;
2529 	char *pb = NULL;
2530 	int bufsz = recsize;
2531 
2532 	const char *r, *s;
2533 	const char *start;
2534 	const char *noempty = NULL;      /* empty match disallowed here */
2535 	size_t m = 0;                    /* match count */
2536 	size_t whichm = 0;               /* which match to select, 0 = global */
2537 	int mtype;                       /* match type */
2538 
2539 	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
2540 		pfa = (fa *) a[1];
2541 	} else {
2542 		x = execute(a[1]);
2543 		pfa = makedfa(getsval(x), 1);
2544 		tempfree(x);
2545 	}
2546 
2547 	x = execute(a[2]);	/* replacement string */
2548 	repl = tostring(getsval(x));
2549 	tempfree(x);
2550 
2551 	switch (subop) {
2552 	case SUB:
2553 		whichm = 1;
2554 		x = execute(a[3]);    /* source string */
2555 		break;
2556 	case GSUB:
2557 		whichm = 0;
2558 		x = execute(a[3]);    /* source string */
2559 		break;
2560 	default:
2561 		FATAL("dosub: unrecognized subop: %d", subop);
2562 	}
2563 
2564 	start = getsval(x);
2565 	while (pmatch(pfa, start)) {
2566 		if (buf == NULL) {
2567 			if ((pb = buf = (char *) malloc(bufsz)) == NULL)
2568 				FATAL("out of memory in dosub");
2569 			tempstat = pfa->initstat;
2570 			pfa->initstat = 2;
2571 		}
2572 
2573 		/* match types */
2574 		#define	MT_IGNORE  0  /* unselected or invalid */
2575 		#define MT_INSERT  1  /* selected, empty */
2576 		#define MT_REPLACE 2  /* selected, not empty */
2577 
2578 		/* an empty match just after replacement is invalid */
2579 
2580 		if (patbeg == noempty && patlen == 0) {
2581 			mtype = MT_IGNORE;    /* invalid, not counted */
2582 		} else if (whichm == ++m || whichm == 0) {
2583 			mtype = patlen ? MT_REPLACE : MT_INSERT;
2584 		} else {
2585 			mtype = MT_IGNORE;    /* unselected, but counted */
2586 		}
2587 
2588 		/* leading text: */
2589 		if (patbeg > start) {
2590 			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
2591 				recsize, &pb, "dosub");
2592 			s = start;
2593 			while (s < patbeg)
2594 				*pb++ = *s++;
2595 		}
2596 
2597 		if (mtype == MT_IGNORE)
2598 			goto matching_text;  /* skip replacement text */
2599 
2600 		r = repl;
2601 		while (*r != 0) {
2602 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
2603 			if (*r == '\\') {
2604 				backsub(&pb, &r);
2605 			} else if (*r == '&') {
2606 				r++;
2607 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
2608 					&pb, "dosub");
2609 				for (s = patbeg; s < patbeg+patlen; )
2610 					*pb++ = *s++;
2611 			} else {
2612 				*pb++ = *r++;
2613 			}
2614 		}
2615 
2616 matching_text:
2617 		if (mtype == MT_REPLACE || *patbeg == '\0')
2618 			goto next_search;  /* skip matching text */
2619 
2620 		if (patlen == 0)
2621 			patlen = u8_nextlen(patbeg);
2622 		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
2623 		s = patbeg;
2624 		while (s < patbeg + patlen)
2625 			*pb++ = *s++;
2626 
2627 next_search:
2628 		start = patbeg + patlen;
2629 		if (m == whichm || *patbeg == '\0')
2630 			break;
2631 		if (mtype == MT_REPLACE)
2632 			noempty = start;
2633 
2634 		#undef MT_IGNORE
2635 		#undef MT_INSERT
2636 		#undef MT_REPLACE
2637 	}
2638 
2639 	xfree(repl);
2640 
2641 	if (buf != NULL) {
2642 		pfa->initstat = tempstat;
2643 
2644 		/* trailing text */
2645 		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
2646 		while ((*pb++ = *start++) != '\0')
2647 			;
2648 
2649 		setsval(x, buf);
2650 		free(buf);
2651 	}
2652 
2653 	tempfree(x);
2654 	x = gettemp();
2655 	x->tval = NUM;
2656 	x->fval = m;
2657 	return x;
2658 }
2659 
gensub(Node ** a,int nnn)2660 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2661 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2662 {
2663 	Cell *x, *y, *res, *h;
2664 	char *rptr;
2665 	const char *sptr;
2666 	char *buf, *pb;
2667 	const char *t, *q;
2668 	fa *pfa;
2669 	int mflag, tempstat, num, whichm;
2670 	int bufsz = recsize;
2671 
2672 	if ((buf = malloc(bufsz)) == NULL)
2673 		FATAL("out of memory in gensub");
2674 	mflag = 0;	/* if mflag == 0, can replace empty string */
2675 	num = 0;
2676 	x = execute(a[4]);	/* source string */
2677 	t = getsval(x);
2678 	res = copycell(x);	/* target string - initially copy of source */
2679 	res->csub = CTEMP;	/* result values are temporary */
2680 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2681 		pfa = (fa *) a[1];	/* regular expression */
2682 	else {
2683 		y = execute(a[1]);
2684 		pfa = makedfa(getsval(y), 1);
2685 		tempfree(y);
2686 	}
2687 	y = execute(a[2]);	/* replacement string */
2688 	h = execute(a[3]);	/* which matches should be replaced */
2689 	sptr = getsval(h);
2690 	if (sptr[0] == 'g' || sptr[0] == 'G')
2691 		whichm = -1;
2692 	else {
2693 		/*
2694 		 * The specified number is index of replacement, starting
2695 		 * from 1. GNU awk treats index lower than 0 same as
2696 		 * 1, we do same for compatibility.
2697 		 */
2698 		whichm = (int) getfval(h) - 1;
2699 		if (whichm < 0)
2700 			whichm = 0;
2701 	}
2702 	tempfree(h);
2703 
2704 	if (pmatch(pfa, t)) {
2705 		char *sl;
2706 
2707 		tempstat = pfa->initstat;
2708 		pfa->initstat = 2;
2709 		pb = buf;
2710 		rptr = getsval(y);
2711 		/*
2712 		 * XXX if there are any backreferences in subst string,
2713 		 * complain now.
2714 		 */
2715 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2716 			if (strchr("0123456789", sl[1])) {
2717 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2718 			}
2719 		}
2720 
2721 		do {
2722 			if (whichm >= 0 && whichm != num) {
2723 				num++;
2724 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2725 
2726 				/* copy the part of string up to and including
2727 				 * match to output buffer */
2728 				while (t < patbeg + patlen)
2729 					*pb++ = *t++;
2730 				continue;
2731 			}
2732 
2733 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2734 				if (mflag == 0) {	/* can replace empty */
2735 					num++;
2736 					sptr = rptr;
2737 					while (*sptr != 0) {
2738 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2739 						if (*sptr == '\\') {
2740 							backsub(&pb, &sptr);
2741 						} else if (*sptr == '&') {
2742 							sptr++;
2743 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2744 							for (q = patbeg; q < patbeg+patlen; )
2745 								*pb++ = *q++;
2746 						} else
2747 							*pb++ = *sptr++;
2748 					}
2749 				}
2750 				if (*t == 0)	/* at end */
2751 					goto done;
2752 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2753 				*pb++ = *t++;
2754 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2755 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2756 				mflag = 0;
2757 			}
2758 			else {	/* matched nonempty string */
2759 				num++;
2760 				sptr = t;
2761 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2762 				while (sptr < patbeg)
2763 					*pb++ = *sptr++;
2764 				sptr = rptr;
2765 				while (*sptr != 0) {
2766 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2767 					if (*sptr == '\\') {
2768 						backsub(&pb, &sptr);
2769 					} else if (*sptr == '&') {
2770 						sptr++;
2771 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2772 						for (q = patbeg; q < patbeg+patlen; )
2773 							*pb++ = *q++;
2774 					} else
2775 						*pb++ = *sptr++;
2776 				}
2777 				t = patbeg + patlen;
2778 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2779 					goto done;
2780 				if (pb > buf + bufsz)
2781 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2782 				mflag = 1;
2783 			}
2784 		} while (pmatch(pfa,t));
2785 		sptr = t;
2786 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2787 		while ((*pb++ = *sptr++) != 0)
2788 			;
2789 	done:	if (pb > buf + bufsz)
2790 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2791 		*pb = '\0';
2792 		setsval(res, buf);
2793 		pfa->initstat = tempstat;
2794 	}
2795 	tempfree(x);
2796 	tempfree(y);
2797 	free(buf);
2798 	return(res);
2799 }
2800 
backsub(char ** pb_ptr,const char ** sptr_ptr)2801 void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
2802 {						/* sptr[0] == '\\' */
2803 	char *pb = *pb_ptr;
2804 	const char *sptr = *sptr_ptr;
2805 	static bool first = true;
2806 	static bool do_posix = false;
2807 
2808 	if (first) {
2809 		first = false;
2810 		do_posix = (getenv("POSIXLY_CORRECT") != NULL);
2811 	}
2812 
2813 	if (sptr[1] == '\\') {
2814 		if (sptr[2] == '\\' && sptr[3] == '&') { /* \\\& -> \& */
2815 			*pb++ = '\\';
2816 			*pb++ = '&';
2817 			sptr += 4;
2818 		} else if (sptr[2] == '&') {	/* \\& -> \ + matched */
2819 			*pb++ = '\\';
2820 			sptr += 2;
2821 		} else if (do_posix) {		/* \\x -> \x */
2822 			sptr++;
2823 			*pb++ = *sptr++;
2824 		} else {			/* \\x -> \\x */
2825 			*pb++ = *sptr++;
2826 			*pb++ = *sptr++;
2827 		}
2828 	} else if (sptr[1] == '&') {	/* literal & */
2829 		sptr++;
2830 		*pb++ = *sptr++;
2831 	} else				/* literal \ */
2832 		*pb++ = *sptr++;
2833 
2834 	*pb_ptr = pb;
2835 	*sptr_ptr = sptr;
2836 }
2837 
wide_char_to_byte_str(int rune,size_t * outlen)2838 static char *wide_char_to_byte_str(int rune, size_t *outlen)
2839 {
2840 	static char buf[5];
2841 	int len;
2842 
2843 	if (rune < 0 || rune > 0x10FFFF)
2844 		return NULL;
2845 
2846 	memset(buf, 0, sizeof(buf));
2847 
2848 	len = 0;
2849 	if (rune <= 0x0000007F) {
2850 		buf[len++] = rune;
2851 	} else if (rune <= 0x000007FF) {
2852 		// 110xxxxx 10xxxxxx
2853 		buf[len++] = 0xC0 | (rune >> 6);
2854 		buf[len++] = 0x80 | (rune & 0x3F);
2855 	} else if (rune <= 0x0000FFFF) {
2856 		// 1110xxxx 10xxxxxx 10xxxxxx
2857 		buf[len++] = 0xE0 | (rune >> 12);
2858 		buf[len++] = 0x80 | ((rune >> 6) & 0x3F);
2859 		buf[len++] = 0x80 | (rune & 0x3F);
2860 
2861 	} else {
2862 		// 0x00010000 - 0x10FFFF
2863 		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2864 		buf[len++] = 0xF0 | (rune >> 18);
2865 		buf[len++] = 0x80 | ((rune >> 12) & 0x3F);
2866 		buf[len++] = 0x80 | ((rune >> 6) & 0x3F);
2867 		buf[len++] = 0x80 | (rune & 0x3F);
2868 	}
2869 
2870 	*outlen = len;
2871 	buf[len++] = '\0';
2872 
2873 	return buf;
2874 }
2875