xref: /freebsd/contrib/one-true-awk/run.c (revision dd78d987cb38ef162d40aad86229f1dc19884f78)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #define DEBUG
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <errno.h>
29 #include <wctype.h>
30 #include <fcntl.h>
31 #include <setjmp.h>
32 #include <limits.h>
33 #include <math.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <time.h>
37 #include <sys/types.h>
38 #include <sys/stat.h>
39 #include <sys/wait.h>
40 #include "awk.h"
41 #include "awkgram.tab.h"
42 
43 
44 static void stdinit(void);
45 static void flush_all(void);
46 static char *wide_char_to_byte_str(int rune, size_t *outlen);
47 
/*
 * tempfree(x): hand x to tfree() when istemp(x) holds; otherwise do nothing.
 *
 * The active (#if 1) form is a macro that expands its argument twice, so the
 * argument should be free of side effects.  The #else branch is a disabled
 * function variant that additionally warns when an OCELL cell carries a csub
 * outside the range CUNK..CFREE before doing the same conditional tfree().
 */
#if 1
#define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
#else
void tempfree(Cell *p) {
	/* sanity check: report a cell whose csub is outside CUNK..CFREE */
	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
		WARNING("bad csub %d in Cell %d %s",
			p->csub, p->ctype, p->sval);
	}
	if (istemp(p))
		tfree(p);
}
#endif
60 
61 /* do we really need these? */
62 /* #ifdef _NFILE */
63 /* #ifndef FOPEN_MAX */
64 /* #define FOPEN_MAX _NFILE */
65 /* #endif */
66 /* #endif */
67 /*  */
68 /* #ifndef	FOPEN_MAX */
69 /* #define	FOPEN_MAX	40 */	/* max number of open files */
70 /* #endif */
71 /*  */
72 /* #ifndef RAND_MAX */
73 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
74 /* #endif */
75 
/* setjmp() target armed in program(); jump() longjmp()s to it for EXIT */
jmp_buf env;
extern	int	pairstack[];
extern	Awkfloat	srand_seed;

Node	*winner = NULL;	/* root of parse tree */
Cell	*tmps;		/* free temporary cells for execution */

/*
 * Statically allocated result cells.  The OBOOL pair is returned by the
 * boolean/relational operators in this file; the OJUMP cells are returned by
 * jump() to signal break/continue/next/nextfile/return (jexit is only tested
 * for, via isexit(), in the code visible here).
 */
static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
Cell	*True	= &truecell;
static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*False	= &falsecell;
static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jbreak	= &breakcell;
static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jcont	= &contcell;
static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnext	= &nextcell;
static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnextfile	= &nextfilecell;
static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jexit	= &exitcell;
static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jret	= &retcell;
/* template that gettemp() copies into every temporary it hands out */
static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };

Node	*curnode = NULL;	/* the node being executed, for debugging */
102 
103 /* buffer memory management */
/*
 * adjbuf: grow a realloc()-managed buffer so it holds at least minlen bytes.
 *
 * pbuf:    address of pointer to buffer being managed
 * psiz:    address of buffer size variable
 * minlen:  minimum length of buffer needed
 * quantum: buffer size quantum (0 means no rounding)
 * pbptr:   address of movable pointer into buffer, or 0 if none
 * whatrtn: name of the calling routine if failure should cause fatal error,
 *          or NULL if the caller wants a 0 return instead
 *
 * Nothing happens when the buffer is already large enough.  Otherwise the
 * request is rounded up to the next multiple of quantum, the buffer is
 * realloc()ed, and *pbuf, *psiz and (if given) *pbptr are updated so that
 * *pbptr keeps the same offset into the new buffer.
 *
 * return   0 for realloc failure (or size overflow), !=0 for success
 */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
{
	char *tbuf;
	int rminlen, boff;

	if (minlen <= *psiz)
		return 1;

	rminlen = quantum ? minlen % quantum : 0;
	boff = pbptr ? *pbptr - *pbuf : 0;
	/* round up to next multiple of quantum */
	if (rminlen) {
		int pad = quantum - rminlen;

		/* rounding up must not overflow the int size */
		if (pad > 0 && minlen > INT_MAX - pad) {
			if (whatrtn)
				FATAL("out of memory in %s", whatrtn);
			return 0;
		}
		minlen += pad;
	}
	tbuf = (char *) realloc(*pbuf, minlen);
	/* whatrtn may legitimately be NULL; never hand NULL to %s */
	DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", NN(whatrtn), *psiz, minlen, (void*)*pbuf, (void*)tbuf);
	if (tbuf == NULL) {
		if (whatrtn)
			FATAL("out of memory in %s", whatrtn);
		return 0;
	}
	*pbuf = tbuf;
	*psiz = minlen;
	if (pbptr)
		*pbptr = tbuf + boff;
	return 1;
}
137 
/*
 * run: entry point for executing a parse tree.
 * Calls stdinit(), executes the tree rooted at a, then calls closeall().
 * The Cell returned by execute() is not examined.
 */
void run(Node *a)
{
	stdinit();
	(void) execute(a);
	closeall();
}
145 
execute(Node * u)146 Cell *execute(Node *u)	/* execute a node of the parse tree */
147 {
148 	Cell *(*proc)(Node **, int);
149 	Cell *x;
150 	Node *a;
151 
152 	if (u == NULL)
153 		return(True);
154 	for (a = u; ; a = a->nnext) {
155 		curnode = a;
156 		if (isvalue(a)) {
157 			x = (Cell *) (a->narg[0]);
158 			if (isfld(x) && !donefld)
159 				fldbld();
160 			else if (isrec(x) && !donerec)
161 				recbld();
162 			return(x);
163 		}
164 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
165 			FATAL("illegal statement");
166 		proc = proctab[a->nobj-FIRSTTOKEN];
167 		x = (*proc)(a->narg, a->nobj);
168 		if (isfld(x) && !donefld)
169 			fldbld();
170 		else if (isrec(x) && !donerec)
171 			recbld();
172 		if (isexpr(a))
173 			return(x);
174 		if (isjump(x))
175 			return(x);
176 		if (a->nnext == NULL)
177 			return(x);
178 		tempfree(x);
179 	}
180 }
181 
182 
/*
 * program: execute a complete awk program.
 *
 * a[0] is the BEGIN action, a[1] the main body, a[2] the END action; n is
 * not used.  Always returns True.
 *
 * env is armed with setjmp() twice.  jump() performs longjmp(env, 1) for
 * EXIT, so an exit taken while the first setjmp is in effect resumes at
 * label ex (skipping to the END action), and an exit taken while the second
 * is in effect resumes at ex1 (leaving the function).
 */
Cell *program(Node **a, int n)	/* execute an awk program */
{				/* a[0] = BEGIN, a[1] = body, a[2] = END */
	Cell *x;

	if (setjmp(env) != 0)	/* non-zero: arrived here via longjmp */
		goto ex;
	if (a[0]) {		/* BEGIN */
		x = execute(a[0]);
		if (isexit(x))
			return(True);
		if (isjump(x))
			FATAL("illegal break, continue, next or nextfile from BEGIN");
		tempfree(x);
	}
	/* input is only read when there is a body or an END action */
	if (a[1] || a[2])
		while (getrec(&record, &recsize, true) > 0) {
			x = execute(a[1]);
			if (isexit(x))
				break;
			tempfree(x);
		}
  ex:
	if (setjmp(env) != 0)	/* handles exit within END */
		goto ex1;
	if (a[2]) {		/* END */
		x = execute(a[2]);
		if (isbreak(x) || isnext(x) || iscont(x))
			FATAL("illegal break, continue, next or nextfile from END");
		tempfree(x);
	}
  ex1:
	return(True);
}
216 
/*
 * Call-stack bookkeeping for user-defined awk functions.  call() pushes a
 * Frame before executing a function body; arg() and the RETURN case of
 * jump() read and write the current frame through frp.
 */
struct Frame {	/* stack frame for awk function calls */
	int nargs;	/* number of arguments in this call */
	Cell *fcncell;	/* pointer to Cell for function */
	Cell **args;	/* pointer to array of arguments after execute */
	Cell *retval;	/* return value */
};

#define	NARGS	50	/* max args in a call */

struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
int	nframe = 0;		/* number of frames allocated */
struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
229 
/*
 * call: invoke a user-defined awk function.
 *
 * a[0] evaluates to the function's Cell, a[1] is the list of actual
 * argument expressions; n is not used.  The steps are:
 *   1. evaluate the function cell and make sure the frame stack exists;
 *   2. evaluate each actual argument (arrays are passed by reference, all
 *      other values are copied with copycell());
 *   3. pad missing arguments up to the defined count with empty CCOPY temps
 *      (these act as the function's locals);
 *   4. push a Frame, execute the body, then release the argument cells;
 *   5. pop the frame and return the frame's retval cell.
 * If the body ends with an exit or next jump, that jump cell is returned
 * instead of the return value.
 */
Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
{
	/* template for arguments the caller did not supply */
	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
	int i, ncall, ndef;
	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
	Node *x;
	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
	Cell *y, *z, *fcn;
	char *s;

	fcn = execute(a[0]);	/* the function itself */
	s = fcn->nval;
	if (!isfcn(fcn))
		FATAL("calling undefined function %s", s);
	if (frame == NULL) {
		/* first call ever: allocate the initial block of 100 frames */
		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames calling %s", s);
	}
	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
		ncall++;
	ndef = (int) fcn->fval;			/* args in defn */
	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
	if (ncall > ndef)
		WARNING("function %s called with %d args, uses only %d",
			s, ncall, ndef);
	if (ncall + ndef > NARGS)
		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
		y = execute(x);
		oargs[i] = y;	/* remember the caller's cell for the array hand-back below */
		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
		if (isfcn(y))
			FATAL("can't use function %s as argument in %s", y->nval, s);
		if (isarr(y))
			args[i] = y;	/* arrays by ref */
		else
			args[i] = copycell(y);
		tempfree(y);
	}
	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
		args[i] = gettemp();
		*args[i] = newcopycell;
	}
	frp++;	/* now ok to up frame */
	if (frp >= frame + nframe) {
		/* frame stack is full: grow by 100 and re-derive frp from its index */
		int dfp = frp - frame;	/* old index */
		frame = (struct Frame *) realloc(frame, (nframe += 100) * sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames in %s", s);
		frp = frame + dfp;
	}
	frp->fcncell = fcn;
	frp->args = args;
	frp->nargs = ndef;	/* number defined with (excess are locals) */
	frp->retval = gettemp();

	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
	y = execute((Node *)(fcn->sval));	/* execute body */
	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));

	/* release the argument cells now that the body has finished */
	for (i = 0; i < ndef; i++) {
		Cell *t = frp->args[i];
		if (isarr(t)) {
			/* a CCOPY cell that is now an array was turned into one inside the body */
			if (t->csub == CCOPY) {
				if (i >= ncall) {
					/* it was a local: discard the table with the cell */
					freesymtab(t);
					t->csub = CTEMP;
					tempfree(t);
				} else {
					/* it was a real argument: hand the table back to the caller's cell */
					oargs[i]->tval = t->tval;
					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
					oargs[i]->sval = t->sval;
					tempfree(t);
				}
			}
		} else if (t != y) {	/* kludge to prevent freeing twice */
			t->csub = CTEMP;
			tempfree(t);
		} else if (t == y && t->csub == CCOPY) {
			/* the body's result is this very argument cell: free it once, here */
			t->csub = CTEMP;
			tempfree(t);
			freed = 1;
		}
	}
	tempfree(fcn);
	/* NOTE(review): this early return happens before the frp-- below -- confirm that is intended */
	if (isexit(y) || isnext(y))
		return y;
	if (freed == 0) {
		tempfree(y);	/* don't free twice! */
	}
	z = frp->retval;			/* return value */
	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
	frp--;
	return(z);
}
328 
/*
 * copycell: duplicate x into a fresh temporary cell and return the copy.
 *
 * The copy drops the CON, FLD and REC type bits and is tagged CCOPY (which
 * keeps it from being freed until the call is over).  String cells get
 * their own duplicate of the string; any other cell is marked DONTFREE.
 * nval is shared with the original, not duplicated.
 */
Cell *copycell(Cell *x)
{
	Cell *dup = gettemp();

	dup->csub = CCOPY;	/* prevents freeing until call is over */
	dup->nval = x->nval;	/* BUG? */
	dup->fval = x->fval;

	/* copy is not constant or field */
	dup->tval = x->tval & ~(CON|FLD|REC);
	if (isstr(x) /* || x->ctype == OCELL */) {
		dup->sval = tostring(x->sval);
		dup->tval &= ~DONTFREE;
	} else {
		dup->tval |= DONTFREE;
	}
	return dup;
}
347 
arg(Node ** a,int n)348 Cell *arg(Node **a, int n)	/* nth argument of a function */
349 {
350 
351 	n = ptoi(a[0]);	/* argument number, counting from 0 */
352 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
353 	if (n+1 > frp->nargs)
354 		FATAL("argument #%d of function %s was not supplied",
355 			n+1, frp->fcncell->nval);
356 	return frp->args[n];
357 }
358 
jump(Node ** a,int n)359 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
360 {
361 	Cell *y;
362 
363 	switch (n) {
364 	case EXIT:
365 		if (a[0] != NULL) {
366 			y = execute(a[0]);
367 			errorflag = (int) getfval(y);
368 			tempfree(y);
369 		}
370 		longjmp(env, 1);
371 	case RETURN:
372 		if (a[0] != NULL) {
373 			y = execute(a[0]);
374 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
375 				setsval(frp->retval, getsval(y));
376 				frp->retval->fval = getfval(y);
377 				frp->retval->tval |= NUM;
378 			}
379 			else if (y->tval & STR)
380 				setsval(frp->retval, getsval(y));
381 			else if (y->tval & NUM)
382 				setfval(frp->retval, getfval(y));
383 			else		/* can't happen */
384 				FATAL("bad type variable %d", y->tval);
385 			tempfree(y);
386 		}
387 		return(jret);
388 	case NEXT:
389 		return(jnext);
390 	case NEXTFILE:
391 		nextfile();
392 		return(jnextfile);
393 	case BREAK:
394 		return(jbreak);
395 	case CONTINUE:
396 		return(jcont);
397 	default:	/* can't happen */
398 		FATAL("illegal jump type %d", n);
399 	}
400 	return 0;	/* not reached */
401 }
402 
awkgetline(Node ** a,int n)403 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
404 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
405 	Cell *r, *x;
406 	extern Cell **fldtab;
407 	FILE *fp;
408 	char *buf;
409 	int bufsize = recsize;
410 	int mode;
411 	bool newflag;
412 	double result;
413 
414 	if ((buf = (char *) malloc(bufsize)) == NULL)
415 		FATAL("out of memory in getline");
416 
417 	fflush(stdout);	/* in case someone is waiting for a prompt */
418 	r = gettemp();
419 	if (a[1] != NULL) {		/* getline < file */
420 		x = execute(a[2]);		/* filename */
421 		mode = ptoi(a[1]);
422 		if (mode == '|')		/* input pipe */
423 			mode = LE;	/* arbitrary flag */
424 		fp = openfile(mode, getsval(x), &newflag);
425 		tempfree(x);
426 		if (fp == NULL)
427 			n = -1;
428 		else
429 			n = readrec(&buf, &bufsize, fp, newflag);
430 		if (n <= 0) {
431 			;
432 		} else if (a[0] != NULL) {	/* getline var <file */
433 			x = execute(a[0]);
434 			setsval(x, buf);
435 			if (is_number(x->sval, & result)) {
436 				x->fval = result;
437 				x->tval |= NUM;
438 			}
439 			tempfree(x);
440 		} else {			/* getline <file */
441 			setsval(fldtab[0], buf);
442 			if (is_number(fldtab[0]->sval, & result)) {
443 				fldtab[0]->fval = result;
444 				fldtab[0]->tval |= NUM;
445 			}
446 		}
447 	} else {			/* bare getline; use current input */
448 		if (a[0] == NULL)	/* getline */
449 			n = getrec(&record, &recsize, true);
450 		else {			/* getline var */
451 			n = getrec(&buf, &bufsize, false);
452 			if (n > 0) {
453 				x = execute(a[0]);
454 				setsval(x, buf);
455 				if (is_number(x->sval, & result)) {
456 					x->fval = result;
457 					x->tval |= NUM;
458 				}
459 				tempfree(x);
460 			}
461 		}
462 	}
463 	setfval(r, (Awkfloat) n);
464 	free(buf);
465 	return r;
466 }
467 
getnf(Node ** a,int n)468 Cell *getnf(Node **a, int n)	/* get NF */
469 {
470 	if (!donefld)
471 		fldbld();
472 	return (Cell *) a[0];
473 }
474 
475 static char *
makearraystring(Node * p,const char * func)476 makearraystring(Node *p, const char *func)
477 {
478 	char *buf;
479 	int bufsz = recsize;
480 	size_t blen;
481 
482 	if ((buf = (char *) malloc(bufsz)) == NULL) {
483 		FATAL("%s: out of memory", func);
484 	}
485 
486 	blen = 0;
487 	buf[blen] = '\0';
488 
489 	for (; p; p = p->nnext) {
490 		Cell *x = execute(p);	/* expr */
491 		char *s = getsval(x);
492 		size_t seplen = strlen(getsval(subseploc));
493 		size_t nsub = p->nnext ? seplen : 0;
494 		size_t slen = strlen(s);
495 		size_t tlen = blen + slen + nsub;
496 
497 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
498 			FATAL("%s: out of memory %s[%s...]",
499 			    func, x->nval, buf);
500 		}
501 		memcpy(buf + blen, s, slen);
502 		if (nsub) {
503 			memcpy(buf + blen + slen, *SUBSEP, nsub);
504 		}
505 		buf[tlen] = '\0';
506 		blen = tlen;
507 		tempfree(x);
508 	}
509 	return buf;
510 }
511 
array(Node ** a,int n)512 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
513 {
514 	Cell *x, *z;
515 	char *buf;
516 
517 	x = execute(a[0]);	/* Cell* for symbol table */
518 	buf = makearraystring(a[1], __func__);
519 	if (!isarr(x)) {
520 		DPRINTF("making %s into an array\n", NN(x->nval));
521 		if (freeable(x))
522 			xfree(x->sval);
523 		x->tval &= ~(STR|NUM|DONTFREE);
524 		x->tval |= ARR;
525 		x->sval = (char *) makesymtab(NSYMTAB);
526 	}
527 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
528 	z->ctype = OCELL;
529 	z->csub = CVAR;
530 	tempfree(x);
531 	free(buf);
532 	return(z);
533 }
534 
awkdelete(Node ** a,int n)535 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
536 {
537 	Cell *x;
538 
539 	x = execute(a[0]);	/* Cell* for symbol table */
540 	if (x == symtabloc) {
541 		FATAL("cannot delete SYMTAB or its elements");
542 	}
543 	if (!isarr(x))
544 		return True;
545 	if (a[1] == NULL) {	/* delete the elements, not the table */
546 		freesymtab(x);
547 		x->tval &= ~STR;
548 		x->tval |= ARR;
549 		x->sval = (char *) makesymtab(NSYMTAB);
550 	} else {
551 		char *buf = makearraystring(a[1], __func__);
552 		freeelem(x, buf);
553 		free(buf);
554 	}
555 	tempfree(x);
556 	return True;
557 }
558 
intest(Node ** a,int n)559 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
560 {
561 	Cell *ap, *k;
562 	char *buf;
563 
564 	ap = execute(a[1]);	/* array name */
565 	if (!isarr(ap)) {
566 		DPRINTF("making %s into an array\n", ap->nval);
567 		if (freeable(ap))
568 			xfree(ap->sval);
569 		ap->tval &= ~(STR|NUM|DONTFREE);
570 		ap->tval |= ARR;
571 		ap->sval = (char *) makesymtab(NSYMTAB);
572 	}
573 	buf = makearraystring(a[0], __func__);
574 	k = lookup(buf, (Array *) ap->sval);
575 	tempfree(ap);
576 	free(buf);
577 	if (k == NULL)
578 		return(False);
579 	else
580 		return(True);
581 }
582 
583 
584 /* ======== utf-8 code ========== */
585 
586 /*
587  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
588  * or utf-8.  u8_isutf tests whether a string starts with a valid
589  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
590  * u8_nextlen returns length of next valid sequence, which is
591  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
592  * u8_strlen returns length of string in valid utf-8 sequences
593  * and/or high-bit bytes.  Conversion functions go between byte
594  * number and character number.
595  *
596  * In theory, this behaves the same as before for non-utf8 bytes.
597  *
598  * Limited checking! This is a potential security hole.
599  */
600 
601 /* is s the beginning of a valid utf-8 string? */
602 /* return length 1..4 if yes, 0 if no */
u8_isutf(const char * s)603 int u8_isutf(const char *s)
604 {
605 	int n, ret;
606 	unsigned char c;
607 
608 	c = s[0];
609 	if (c < 128 || awk_mb_cur_max == 1)
610 		return 1; /* what if it's 0? */
611 
612 	n = strlen(s);
613 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
614 		ret = 2; /* 110xxxxx 10xxxxxx */
615 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
616 			 && (s[2] & 0xC0) == 0x80) {
617 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
618 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
619 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
620 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
621 	} else {
622 		ret = 0;
623 	}
624 	return ret;
625 }
626 
627 /* Convert (prefix of) utf8 string to utf-32 rune. */
628 /* Sets *rune to the value, returns the length. */
629 /* No error checking: watch out. */
u8_rune(int * rune,const char * s)630 int u8_rune(int *rune, const char *s)
631 {
632 	int n, ret;
633 	unsigned char c;
634 
635 	c = s[0];
636 	if (c < 128 || awk_mb_cur_max == 1) {
637 		*rune = c;
638 		return 1;
639 	}
640 
641 	n = strlen(s);
642 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
643 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
644 		ret = 2;
645 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
646 			  && (s[2] & 0xC0) == 0x80) {
647 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
648 			/* 1110xxxx 10xxxxxx 10xxxxxx */
649 		ret = 3;
650 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
651 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
652 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
653 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
654 		ret = 4;
655 	} else {
656 		*rune = c;
657 		ret = 1;
658 	}
659 	return ret; /* returns one byte if sequence doesn't look like utf */
660 }
661 
/*
 * u8_nextlen: byte length of the next character in s.
 * This is whatever u8_isutf() reports, except that its 0 ("not utf-8")
 * answer is turned into 1 so the byte is stepped over on its own.
 */
int u8_nextlen(const char *s)
{
	int seqlen = u8_isutf(s);

	return (seqlen != 0) ? seqlen : 1;
}
672 
673 /* return number of utf characters or single non-utf bytes */
u8_strlen(const char * s)674 int u8_strlen(const char *s)
675 {
676 	int i, len, n, totlen;
677 	unsigned char c;
678 
679 	n = strlen(s);
680 	totlen = 0;
681 	for (i = 0; i < n; i += len) {
682 		c = s[i];
683 		if (c < 128 || awk_mb_cur_max == 1) {
684 			len = 1;
685 		} else {
686 			len = u8_nextlen(&s[i]);
687 		}
688 		totlen++;
689 		if (i > n)
690 			FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
691 	}
692 	return totlen;
693 }
694 
/*
 * u8_char2byte: convert a utf-8 character number in s to its byte offset.
 *
 * charnum counts characters from 0, so the result is the number of bytes
 * occupied by the first charnum characters of s.  If s holds fewer than
 * charnum characters the walk stops at the terminating NUL and the byte
 * length of s is returned, rather than stepping past the end of the
 * string.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}
709 
/*
 * u8_byte2char: convert a byte offset in s to a character number.
 *
 * Walks s one character at a time from the start and counts every
 * character that begins at or before bytenum, so the result is 1 for
 * offset 0.  Returns -1 when bytenum lies beyond the end of the string.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int total = strlen(s);
	int pos = 0;
	int count = 0;	/* BUG: what origin? */
	/* should be 0 to match start==0 which means no match */

	if (bytenum > total)
		return -1; /* ??? */

	while (pos <= bytenum) {
		pos += u8_nextlen(s + pos);
		count++;
	}
	return count;
}
727 
/* runetochar() adapted from rune.c in the Plan 9 distribution */

/* Bit-layout constants for the utf-8 encoder below. */
enum
{
	Runeerror = 128, /* from somewhere else */
	Runemax = 0x10FFFF,

	/* number of payload bits in the lead byte of an N-byte sequence */
	Bit1    = 7,
	Bitx    = 6,	/* payload bits in a continuation byte */
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	/* fixed high-bit tags */
	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */

	/* largest value that fits in an N-byte sequence */
	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0001 1111 1111 1111 1111 1111 */

	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
	Testx   = Maskx ^ 0xFF,		 	/* 1100 0000 */

};

/*
 * runetochar: encode the code point c as utf-8 into str.
 *
 * Writes 1 to 4 bytes (no terminating NUL) and returns how many were
 * written.  Values up to Rune1 are stored as a single byte; values above
 * Runemax are replaced by Runeerror before being encoded.
 */
int runetochar(char *str, int c)
{
	int nbytes, lead, k;

	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	if (c <= Rune2) {
		/* two character sequence 00080-007FF => T2 Tx */
		nbytes = 2;
		lead = T2;
	} else {
		if (c > Runemax)
			c = Runeerror;
		if (c <= Rune3) {
			/* three character sequence 00800-0FFFF => T3 Tx Tx */
			nbytes = 3;
			lead = T3;
		} else {
			/* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */
			nbytes = 4;
			lead = T4;
		}
	}

	/* lead byte holds the topmost bits, each following byte the next six */
	str[0] = lead | (c >> ((nbytes - 1) * Bitx));
	for (k = 1; k < nbytes; k++)
		str[k] = Tx | ((c >> ((nbytes - 1 - k) * Bitx)) & Maskx);
	return nbytes;
}
791 
792 
793 /* ========== end of utf8 code =========== */
794 
795 
796 
matchop(Node ** a,int n)797 Cell *matchop(Node **a, int n)	/* ~ and match() */
798 {
799 	Cell *x, *y, *z;
800 	char *s, *t;
801 	int i;
802 	int cstart, cpatlen, len;
803 	fa *pfa;
804 	int (*mf)(fa *, const char *) = match, mode = 0;
805 
806 	if (n == MATCHFCN) {
807 		mf = pmatch;
808 		mode = 1;
809 	}
810 	x = execute(a[1]);	/* a[1] = target text */
811 	s = getsval(x);
812 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
813 		i = (*mf)((fa *) a[2], s);
814 	else {
815 		y = execute(a[2]);	/* a[2] = regular expr */
816 		t = getsval(y);
817 		pfa = makedfa(t, mode);
818 		i = (*mf)(pfa, s);
819 		tempfree(y);
820 	}
821 	z = x;
822 	if (n == MATCHFCN) {
823 		int start = patbeg - s + 1; /* origin 1 */
824 		if (patlen < 0) {
825 			start = 0; /* not found */
826 		} else {
827 			cstart = u8_byte2char(s, start-1);
828 			cpatlen = 0;
829 			for (i = 0; i < patlen; i += len) {
830 				len = u8_nextlen(patbeg+i);
831 				cpatlen++;
832 			}
833 
834 			start = cstart;
835 			patlen = cpatlen;
836 		}
837 
838 		setfval(rstartloc, (Awkfloat) start);
839 		setfval(rlengthloc, (Awkfloat) patlen);
840 		x = gettemp();
841 		x->tval = NUM;
842 		x->fval = start;
843 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
844 		x = True;
845 	else
846 		x = False;
847 
848 	tempfree(z);
849 	return x;
850 }
851 
852 
boolop(Node ** a,int n)853 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
854 {
855 	Cell *x, *y;
856 	int i;
857 
858 	x = execute(a[0]);
859 	i = istrue(x);
860 	tempfree(x);
861 	switch (n) {
862 	case BOR:
863 		if (i) return(True);
864 		y = execute(a[1]);
865 		i = istrue(y);
866 		tempfree(y);
867 		if (i) return(True);
868 		else return(False);
869 	case AND:
870 		if ( !i ) return(False);
871 		y = execute(a[1]);
872 		i = istrue(y);
873 		tempfree(y);
874 		if (i) return(True);
875 		else return(False);
876 	case NOT:
877 		if (i) return(False);
878 		else return(True);
879 	default:	/* can't happen */
880 		FATAL("unknown boolean operator %d", n);
881 	}
882 	return 0;	/*NOTREACHED*/
883 }
884 
relop(Node ** a,int n)885 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
886 {
887 	int i;
888 	Cell *x, *y;
889 	Awkfloat j;
890 	bool x_is_nan, y_is_nan;
891 
892 	x = execute(a[0]);
893 	y = execute(a[1]);
894 	x_is_nan = isnan(x->fval);
895 	y_is_nan = isnan(y->fval);
896 	if (x->tval&NUM && y->tval&NUM) {
897 		if ((x_is_nan || y_is_nan) && n != NE)
898 			return(False);
899 		j = x->fval - y->fval;
900 		i = j<0? -1: (j>0? 1: 0);
901 	} else {
902 		i = strcmp(getsval(x), getsval(y));
903 	}
904 	tempfree(x);
905 	tempfree(y);
906 	switch (n) {
907 	case LT:	if (i<0) return(True);
908 			else return(False);
909 	case LE:	if (i<=0) return(True);
910 			else return(False);
911 	case NE:	if (x_is_nan && y_is_nan) return(True);
912 			else if (i!=0) return(True);
913 			else return(False);
914 	case EQ:	if (i == 0) return(True);
915 			else return(False);
916 	case GE:	if (i>=0) return(True);
917 			else return(False);
918 	case GT:	if (i>0) return(True);
919 			else return(False);
920 	default:	/* can't happen */
921 		FATAL("unknown relational operator %d", n);
922 	}
923 	return 0;	/*NOTREACHED*/
924 }
925 
/*
 * tfree: return the temporary cell a to the free list headed by tmps.
 * The cell's string is released first when it is freeable.  Finding a
 * already at the head of the free list is treated as fatal corruption.
 */
void tfree(Cell *a)
{
	if (freeable(a)) {
		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
		xfree(a->sval);
	}

	if (tmps == a) {
		FATAL("tempcell list is curdled");
	}

	/* push onto the front of the free list */
	a->cnext = tmps;
	tmps = a;
}
937 
gettemp(void)938 Cell *gettemp(void)	/* get a tempcell */
939 {	int i;
940 	Cell *x;
941 
942 	if (!tmps) {
943 		tmps = (Cell *) calloc(100, sizeof(*tmps));
944 		if (!tmps)
945 			FATAL("out of space for temporaries");
946 		for (i = 1; i < 100; i++)
947 			tmps[i-1].cnext = &tmps[i];
948 		tmps[i-1].cnext = NULL;
949 	}
950 	x = tmps;
951 	tmps = x->cnext;
952 	*x = tempcell;
953 	return(x);
954 }
955 
indirect(Node ** a,int n)956 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
957 {
958 	Awkfloat val;
959 	Cell *x;
960 	int m;
961 
962 	x = execute(a[0]);
963 	val = getfval(x);	/* freebsd: defend against super large field numbers */
964 	if ((Awkfloat)INT_MAX < val)
965 		FATAL("trying to access out of range field %s", x->nval);
966 	m = (int) val;
967 	tempfree(x);
968 	x = fieldadr(m);
969 	x->ctype = OCELL;	/* BUG?  why are these needed? */
970 	x->csub = CFLD;
971 	return(x);
972 }
973 
substr(Node ** a,int nnn)974 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
975 {
976 	int k, m, n;
977 	int mb, nb;
978 	char *s;
979 	int temp;
980 	Cell *x, *y, *z = NULL;
981 
982 	x = execute(a[0]);
983 	y = execute(a[1]);
984 	if (a[2] != NULL)
985 		z = execute(a[2]);
986 	s = getsval(x);
987 	k = u8_strlen(s) + 1;
988 	if (k <= 1) {
989 		tempfree(x);
990 		tempfree(y);
991 		if (a[2] != NULL) {
992 			tempfree(z);
993 		}
994 		x = gettemp();
995 		setsval(x, "");
996 		return(x);
997 	}
998 	m = (int) getfval(y);
999 	if (m <= 0)
1000 		m = 1;
1001 	else if (m > k)
1002 		m = k;
1003 	tempfree(y);
1004 	if (a[2] != NULL) {
1005 		n = (int) getfval(z);
1006 		tempfree(z);
1007 	} else
1008 		n = k - 1;
1009 	if (n < 0)
1010 		n = 0;
1011 	else if (n > k - m)
1012 		n = k - m;
1013 	/* m is start, n is length from there */
1014 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1015 	y = gettemp();
1016 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1017 	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
1018 
1019 	temp = s[nb];	/* with thanks to John Linderman */
1020 	s[nb] = '\0';
1021 	setsval(y, s + mb);
1022 	s[nb] = temp;
1023 	tempfree(x);
1024 	return(y);
1025 }
1026 
sindex(Node ** a,int nnn)1027 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1028 {
1029 	Cell *x, *y, *z;
1030 	char *s1, *s2, *p1, *p2, *q;
1031 	Awkfloat v = 0.0;
1032 
1033 	x = execute(a[0]);
1034 	s1 = getsval(x);
1035 	y = execute(a[1]);
1036 	s2 = getsval(y);
1037 
1038 	z = gettemp();
1039 	for (p1 = s1; *p1 != '\0'; p1++) {
1040 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1041 			continue;
1042 		if (*p2 == '\0') {
1043 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1044 
1045 		   /* should be a function: used in match() as well */
1046 			int i, len;
1047 			v = 0;
1048 			for (i = 0; i < p1-s1+1; i += len) {
1049 				len = u8_nextlen(s1+i);
1050 				v++;
1051 			}
1052 			break;
1053 		}
1054 	}
1055 	tempfree(x);
1056 	tempfree(y);
1057 	setfval(z, v);
1058 	return(z);
1059 }
1060 
/*
 * has_utf8: return 1 if s contains any utf-8 (2 bytes or more) character,
 * as measured by u8_nextlen(), and 0 otherwise.
 */
int has_utf8(char *s)
{
	while (*s != 0) {
		int step = u8_nextlen(s);

		if (step > 1)
			return 1;
		s += step;
	}
	return 0;
}
1072 
1073 #define	MAXNUMSIZE	50
1074 
format(char ** pbuf,int * pbufsize,const char * s,Node * a)1075 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1076 {
1077 	char *fmt;
1078 	char *p, *t;
1079 	const char *os;
1080 	Cell *x;
1081 	int flag = 0, n;
1082 	int fmtwd; /* format width */
1083 	int fmtsz = recsize;
1084 	char *buf = *pbuf;
1085 	int bufsize = *pbufsize;
1086 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1087 #define BUFSZ(a)   (bufsize - ((a) - buf))
1088 
1089 	static bool first = true;
1090 	static bool have_a_format = false;
1091 
1092 	if (first) {
1093 		char xbuf[100];
1094 
1095 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1096 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1097 		first = false;
1098 	}
1099 
1100 	os = s;
1101 	p = buf;
1102 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1103 		FATAL("out of memory in format()");
1104 	while (*s) {
1105 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1106 		if (*s != '%') {
1107 			*p++ = *s++;
1108 			continue;
1109 		}
1110 		if (*(s+1) == '%') {
1111 			*p++ = '%';
1112 			s += 2;
1113 			continue;
1114 		}
1115 		fmtwd = atoi(s+1);
1116 		if (fmtwd < 0)
1117 			fmtwd = -fmtwd;
1118 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1119 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1120 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1121 				FATAL("format item %.30s... ran format() out of memory", os);
1122 			/* Ignore size specifiers */
1123 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1124 				t--;
1125 				continue;
1126 			}
1127 			if (isalpha((uschar)*s))
1128 				break;
1129 			if (*s == '$') {
1130 				FATAL("'$' not permitted in awk formats");
1131 			}
1132 			if (*s == '*') {
1133 				if (a == NULL) {
1134 					FATAL("not enough args in printf(%s)", os);
1135 				}
1136 				x = execute(a);
1137 				a = a->nnext;
1138 				snprintf(t - 1, FMTSZ(t - 1),
1139 				    "%d", fmtwd=(int) getfval(x));
1140 				if (fmtwd < 0)
1141 					fmtwd = -fmtwd;
1142 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1143 				t = fmt + strlen(fmt);
1144 				tempfree(x);
1145 			}
1146 		}
1147 		*t = '\0';
1148 		if (fmtwd < 0)
1149 			fmtwd = -fmtwd;
1150 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1151 		switch (*s) {
1152 		case 'a': case 'A':
1153 			if (have_a_format)
1154 				flag = *s;
1155 			else
1156 				flag = 'f';
1157 			break;
1158 		case 'f': case 'e': case 'g': case 'E': case 'G':
1159 			flag = 'f';
1160 			break;
1161 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1162 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1163 			*(t-1) = 'j';
1164 			*t = *s;
1165 			*++t = '\0';
1166 			break;
1167 		case 's':
1168 			flag = 's';
1169 			break;
1170 		case 'c':
1171 			flag = 'c';
1172 			break;
1173 		default:
1174 			WARNING("weird printf conversion %s", fmt);
1175 			flag = '?';
1176 			break;
1177 		}
1178 		if (a == NULL)
1179 			FATAL("not enough args in printf(%s)", os);
1180 		x = execute(a);
1181 		a = a->nnext;
1182 		n = MAXNUMSIZE;
1183 		if (fmtwd > n)
1184 			n = fmtwd;
1185 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1186 		switch (flag) {
1187 		case '?':
1188 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1189 			t = getsval(x);
1190 			n = strlen(t);
1191 			if (fmtwd > n)
1192 				n = fmtwd;
1193 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1194 			p += strlen(p);
1195 			snprintf(p, BUFSZ(p), "%s", t);
1196 			break;
1197 		case 'a':
1198 		case 'A':
1199 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1200 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1201 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1202 
1203 		case 's': {
1204 			t = getsval(x);
1205 			n = strlen(t);
1206 			/* if simple format or no utf-8 in the string, sprintf works */
1207 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1208 				if (fmtwd > n)
1209 					n = fmtwd;
1210 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1211 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1212 						" ran format() out of memory", n, t);
1213 				snprintf(p, BUFSZ(p), fmt, t);
1214 				break;
1215 			}
1216 
1217 			/* get here if string has utf-8 chars and fmt is not plain %s */
1218 			/* "%-w.ps", where -, w and .p are all optional */
1219 			/* '0' before the w is a flag character */
1220 			/* fmt points at % */
1221 			int ljust = 0, wid = 0, prec = n, pad = 0;
1222 			char *f = fmt+1;
1223 			if (f[0] == '-') {
1224 				ljust = 1;
1225 				f++;
1226 			}
1227 			// flags '0' and '+' are recognized but skipped
1228 			if (f[0] == '0') {
1229 				f++;
1230 				if (f[0] == '+')
1231 					f++;
1232 			}
1233 			if (f[0] == '+') {
1234 				f++;
1235 				if (f[0] == '0')
1236 					f++;
1237 			}
1238 			if (isdigit(f[0])) { /* there is a wid */
1239 				wid = strtol(f, &f, 10);
1240 			}
1241 			if (f[0] == '.') { /* there is a .prec */
1242 				prec = strtol(++f, &f, 10);
1243 			}
1244 			if (prec > u8_strlen(t))
1245 				prec = u8_strlen(t);
1246 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1247 			int i, k, n;
1248 
1249 			if (ljust) { // print prec chars from t, then pad blanks
1250 				n = u8_char2byte(t, prec);
1251 				for (k = 0; k < n; k++) {
1252 					//putchar(t[k]);
1253 					*p++ = t[k];
1254 				}
1255 				for (i = 0; i < pad; i++) {
1256 					//printf(" ");
1257 					*p++ = ' ';
1258 				}
1259 			} else { // print pad blanks, then prec chars from t
1260 				for (i = 0; i < pad; i++) {
1261 					//printf(" ");
1262 					*p++ = ' ';
1263 				}
1264 				n = u8_char2byte(t, prec);
1265 				for (k = 0; k < n; k++) {
1266 					//putchar(t[k]);
1267 					*p++ = t[k];
1268 				}
1269 			}
1270 			*p = 0;
1271 			break;
1272 		}
1273 
1274                case 'c': {
1275 			/*
1276 			 * If a numeric value is given, awk should just turn
1277 			 * it into a character and print it:
1278 			 *      BEGIN { printf("%c\n", 65) }
1279 			 * prints "A".
1280 			 *
1281 			 * But what if the numeric value is > 128 and
1282 			 * represents a valid Unicode code point?!? We do
1283 			 * our best to convert it back into UTF-8. If we
1284 			 * can't, we output the encoding of the Unicode
1285 			 * "invalid character", 0xFFFD.
1286 			 */
1287 			if (isnum(x)) {
1288 				int charval = (int) getfval(x);
1289 
1290 				if (charval != 0) {
1291 					if (charval < 128 || awk_mb_cur_max == 1)
1292 						snprintf(p, BUFSZ(p), fmt, charval);
1293 					else {
1294 						// possible unicode character
1295 						size_t count;
1296 						char *bs = wide_char_to_byte_str(charval, &count);
1297 
1298 						if (bs == NULL)	{ // invalid character
1299 							// use unicode invalid character, 0xFFFD
1300 							static char invalid_char[] = "\357\277\275";
1301 							bs = invalid_char;
1302 							count = 3;
1303 						}
1304 						t = bs;
1305 						n = count;
1306 						goto format_percent_c;
1307 					}
1308 				} else {
1309 					*p++ = '\0'; /* explicit null byte */
1310 					*p = '\0';   /* next output will start here */
1311 				}
1312 				break;
1313 			}
1314 			t = getsval(x);
1315 			n = u8_nextlen(t);
1316 		format_percent_c:
1317 			if (n < 2) { /* not utf8 */
1318 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1319 				break;
1320 			}
1321 
1322 			// utf8 character, almost same song and dance as for %s
1323 			int ljust = 0, wid = 0, prec = n, pad = 0;
1324 			char *f = fmt+1;
1325 			if (f[0] == '-') {
1326 				ljust = 1;
1327 				f++;
1328 			}
1329 			// flags '0' and '+' are recognized but skipped
1330 			if (f[0] == '0') {
1331 				f++;
1332 				if (f[0] == '+')
1333 					f++;
1334 			}
1335 			if (f[0] == '+') {
1336 				f++;
1337 				if (f[0] == '0')
1338 					f++;
1339 			}
1340 			if (isdigit(f[0])) { /* there is a wid */
1341 				wid = strtol(f, &f, 10);
1342 			}
1343 			if (f[0] == '.') { /* there is a .prec */
1344 				prec = strtol(++f, &f, 10);
1345 			}
1346 			if (prec > 1)           // %c --> only one character
1347 				prec = 1;
1348 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1349 			int i;
1350 
1351 			if (ljust) { // print one char from t, then pad blanks
1352 				for (i = 0; i < n; i++)
1353 					*p++ = t[i];
1354 				for (i = 0; i < pad; i++) {
1355 					//printf(" ");
1356 					*p++ = ' ';
1357 				}
1358 			} else { // print pad blanks, then prec chars from t
1359 				for (i = 0; i < pad; i++) {
1360 					//printf(" ");
1361 					*p++ = ' ';
1362 				}
1363 				for (i = 0; i < n; i++)
1364 					*p++ = t[i];
1365 			}
1366 			*p = 0;
1367 			break;
1368 		}
1369 		default:
1370 			FATAL("can't happen: bad conversion %c in format()", flag);
1371 		}
1372 
1373 		tempfree(x);
1374 		p += strlen(p);
1375 		s++;
1376 	}
1377 	*p = '\0';
1378 	free(fmt);
1379 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1380 		x = execute(a);
1381 		tempfree(x);
1382 	}
1383 	*pbuf = buf;
1384 	*pbufsize = bufsize;
1385 	return p - buf;
1386 }
1387 
awksprintf(Node ** a,int n)1388 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1389 {
1390 	Cell *x;
1391 	Node *y;
1392 	char *buf;
1393 	int bufsz=3*recsize;
1394 
1395 	if ((buf = (char *) malloc(bufsz)) == NULL)
1396 		FATAL("out of memory in awksprintf");
1397 	y = a[0]->nnext;
1398 	x = execute(a[0]);
1399 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1400 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1401 	tempfree(x);
1402 	x = gettemp();
1403 	x->sval = buf;
1404 	x->tval = STR;
1405 	return(x);
1406 }
1407 
awkprintf(Node ** a,int n)1408 Cell *awkprintf(Node **a, int n)		/* printf */
1409 {	/* a[0] is list of args, starting with format string */
1410 	/* a[1] is redirection operator, a[2] is redirection file */
1411 	FILE *fp;
1412 	Cell *x;
1413 	Node *y;
1414 	char *buf;
1415 	int len;
1416 	int bufsz=3*recsize;
1417 
1418 	if ((buf = (char *) malloc(bufsz)) == NULL)
1419 		FATAL("out of memory in awkprintf");
1420 	y = a[0]->nnext;
1421 	x = execute(a[0]);
1422 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1423 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1424 	tempfree(x);
1425 	if (a[1] == NULL) {
1426 		/* fputs(buf, stdout); */
1427 		fwrite(buf, len, 1, stdout);
1428 		if (ferror(stdout))
1429 			FATAL("write error on stdout");
1430 	} else {
1431 		fp = redirect(ptoi(a[1]), a[2]);
1432 		/* fputs(buf, fp); */
1433 		fwrite(buf, len, 1, fp);
1434 		fflush(fp);
1435 		if (ferror(fp))
1436 			FATAL("write error on %s", filename(fp));
1437 	}
1438 	free(buf);
1439 	return(True);
1440 }
1441 
arith(Node ** a,int n)1442 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1443 {
1444 	Awkfloat i, j = 0;
1445 	double v;
1446 	Cell *x, *y, *z;
1447 
1448 	x = execute(a[0]);
1449 	i = getfval(x);
1450 	tempfree(x);
1451 	if (n != UMINUS && n != UPLUS) {
1452 		y = execute(a[1]);
1453 		j = getfval(y);
1454 		tempfree(y);
1455 	}
1456 	z = gettemp();
1457 	switch (n) {
1458 	case ADD:
1459 		i += j;
1460 		break;
1461 	case MINUS:
1462 		i -= j;
1463 		break;
1464 	case MULT:
1465 		i *= j;
1466 		break;
1467 	case DIVIDE:
1468 		if (j == 0)
1469 			FATAL("division by zero");
1470 		i /= j;
1471 		break;
1472 	case MOD:
1473 		if (j == 0)
1474 			FATAL("division by zero in mod");
1475 		modf(i/j, &v);
1476 		i = i - j * v;
1477 		break;
1478 	case UMINUS:
1479 		i = -i;
1480 		break;
1481 	case UPLUS: /* handled by getfval(), above */
1482 		break;
1483 	case POWER:
1484 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1485 			i = ipow(i, (int) j);
1486                else {
1487 			errno = 0;
1488 			i = errcheck(pow(i, j), "pow");
1489                }
1490 		break;
1491 	default:	/* can't happen */
1492 		FATAL("illegal arithmetic operator %d", n);
1493 	}
1494 	setfval(z, i);
1495 	return(z);
1496 }
1497 
/*
 * x**n by repeated squaring.  ought to be done by pow, but isn't always.
 * Any n <= 0 yields 1.
 */
double ipow(double x, int n)
{
	double half;

	if (n <= 0)
		return 1;
	half = ipow(x, n / 2);
	/* an odd exponent needs one extra factor of x */
	return (n % 2 != 0) ? x * half * half : half * half;
}
1510 
incrdecr(Node ** a,int n)1511 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1512 {
1513 	Cell *x, *z;
1514 	int k;
1515 	Awkfloat xf;
1516 
1517 	x = execute(a[0]);
1518 	xf = getfval(x);
1519 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1520 	if (n == PREINCR || n == PREDECR) {
1521 		setfval(x, xf + k);
1522 		return(x);
1523 	}
1524 	z = gettemp();
1525 	setfval(z, xf);
1526 	setfval(x, xf + k);
1527 	tempfree(x);
1528 	return(z);
1529 }
1530 
assign(Node ** a,int n)1531 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1532 {		/* this is subtle; don't muck with it. */
1533 	Cell *x, *y;
1534 	Awkfloat xf, yf;
1535 	double v;
1536 
1537 	y = execute(a[1]);
1538 	x = execute(a[0]);
1539 	if (n == ASSIGN) {	/* ordinary assignment */
1540 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1541 			;	/* self-assignment: leave alone unless it's a field or NF */
1542 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
1543 			yf = getfval(y);
1544 			setsval(x, getsval(y));
1545 			x->fval = yf;
1546 			x->tval |= NUM;
1547 		}
1548 		else if (isstr(y))
1549 			setsval(x, getsval(y));
1550 		else if (isnum(y))
1551 			setfval(x, getfval(y));
1552 		else
1553 			funnyvar(y, "read value of");
1554 		tempfree(y);
1555 		return(x);
1556 	}
1557 	xf = getfval(x);
1558 	yf = getfval(y);
1559 	switch (n) {
1560 	case ADDEQ:
1561 		xf += yf;
1562 		break;
1563 	case SUBEQ:
1564 		xf -= yf;
1565 		break;
1566 	case MULTEQ:
1567 		xf *= yf;
1568 		break;
1569 	case DIVEQ:
1570 		if (yf == 0)
1571 			FATAL("division by zero in /=");
1572 		xf /= yf;
1573 		break;
1574 	case MODEQ:
1575 		if (yf == 0)
1576 			FATAL("division by zero in %%=");
1577 		modf(xf/yf, &v);
1578 		xf = xf - yf * v;
1579 		break;
1580 	case POWEQ:
1581 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1582 			xf = ipow(xf, (int) yf);
1583                else {
1584 			errno = 0;
1585 			xf = errcheck(pow(xf, yf), "pow");
1586                }
1587 		break;
1588 	default:
1589 		FATAL("illegal assignment operator %d", n);
1590 		break;
1591 	}
1592 	tempfree(y);
1593 	setfval(x, xf);
1594 	return(x);
1595 }
1596 
cat(Node ** a,int q)1597 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1598 {
1599 	Cell *x, *y, *z;
1600 	int n1, n2;
1601 	char *s = NULL;
1602 	int ssz = 0;
1603 
1604 	x = execute(a[0]);
1605 	n1 = strlen(getsval(x));
1606 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1607 	memcpy(s, x->sval, n1);
1608 
1609 	tempfree(x);
1610 
1611 	y = execute(a[1]);
1612 	n2 = strlen(getsval(y));
1613 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1614 	memcpy(s + n1, y->sval, n2);
1615 	s[n1 + n2] = '\0';
1616 
1617 	tempfree(y);
1618 
1619 	z = gettemp();
1620 	z->sval = s;
1621 	z->tval = STR;
1622 
1623 	return(z);
1624 }
1625 
pastat(Node ** a,int n)1626 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1627 {
1628 	Cell *x;
1629 
1630 	if (a[0] == NULL)
1631 		x = execute(a[1]);
1632 	else {
1633 		x = execute(a[0]);
1634 		if (istrue(x)) {
1635 			tempfree(x);
1636 			x = execute(a[1]);
1637 		}
1638 	}
1639 	return x;
1640 }
1641 
dopa2(Node ** a,int n)1642 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1643 {
1644 	Cell *x;
1645 	int pair;
1646 
1647 	pair = ptoi(a[3]);
1648 	if (pairstack[pair] == 0) {
1649 		x = execute(a[0]);
1650 		if (istrue(x))
1651 			pairstack[pair] = 1;
1652 		tempfree(x);
1653 	}
1654 	if (pairstack[pair] == 1) {
1655 		x = execute(a[1]);
1656 		if (istrue(x))
1657 			pairstack[pair] = 0;
1658 		tempfree(x);
1659 		x = execute(a[2]);
1660 		return(x);
1661 	}
1662 	return(False);
1663 }
1664 
split(Node ** a,int nnn)1665 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1666 {
1667 	Cell *x = NULL, *y, *ap;
1668 	const char *s, *origs, *t;
1669 	const char *fs = NULL;
1670 	char *origfs = NULL;
1671 	int sep;
1672 	char temp, num[50];
1673 	int n, tempstat, arg3type;
1674 	int j;
1675 	double result;
1676 
1677 	y = execute(a[0]);	/* source string */
1678 	origs = s = strdup(getsval(y));
1679 	tempfree(y);
1680 	arg3type = ptoi(a[3]);
1681 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1682 		fs = getsval(fsloc);
1683 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1684 		x = execute(a[2]);
1685 		fs = origfs = strdup(getsval(x));
1686 		tempfree(x);
1687 	} else if (arg3type == REGEXPR) {
1688 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1689 	} else {
1690 		FATAL("illegal type of split");
1691 	}
1692 	sep = *fs;
1693 	ap = execute(a[1]);	/* array name */
1694 /* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1695 	freesymtab(ap);
1696 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1697 	ap->tval &= ~STR;
1698 	ap->tval |= ARR;
1699 	ap->sval = (char *) makesymtab(NSYMTAB);
1700 
1701 	n = 0;
1702         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1703 		/* split(s, a, //); have to arrange that it looks like empty sep */
1704 		arg3type = 0;
1705 		fs = "";
1706 		sep = 0;
1707 	}
1708 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1709 		fa *pfa;
1710 		if (arg3type == REGEXPR) {	/* it's ready already */
1711 			pfa = (fa *) a[2];
1712 		} else {
1713 			pfa = makedfa(fs, 1);
1714 		}
1715 		if (nematch(pfa,s)) {
1716 			tempstat = pfa->initstat;
1717 			pfa->initstat = 2;
1718 			do {
1719 				n++;
1720 				snprintf(num, sizeof(num), "%d", n);
1721 				temp = *patbeg;
1722 				setptr(patbeg, '\0');
1723 				if (is_number(s, & result))
1724 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1725 				else
1726 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1727 				setptr(patbeg, temp);
1728 				s = patbeg + patlen;
1729 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1730 					n++;
1731 					snprintf(num, sizeof(num), "%d", n);
1732 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1733 					pfa->initstat = tempstat;
1734 					goto spdone;
1735 				}
1736 			} while (nematch(pfa,s));
1737 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1738 							/* cf gsub and refldbld */
1739 		}
1740 		n++;
1741 		snprintf(num, sizeof(num), "%d", n);
1742 		if (is_number(s, & result))
1743 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1744 		else
1745 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1746   spdone:
1747 		pfa = NULL;
1748 
1749 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1750 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1751 		for (;;) {
1752 			char *fr = newt;
1753 			n++;
1754 			if (*s == '"' ) { /* start of "..." */
1755 				for (s++ ; *s != '\0'; ) {
1756 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1757 						s += 2; /* doubled quote */
1758 						*fr++ = '"';
1759 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1760 						s++; /* skip over closing quote */
1761 						break;
1762 					} else {
1763 						*fr++ = *s++;
1764 					}
1765 				}
1766 				*fr++ = 0;
1767 			} else {	/* unquoted field */
1768 				while (*s != ',' && *s != '\0')
1769 					*fr++ = *s++;
1770 				*fr++ = 0;
1771 			}
1772 			snprintf(num, sizeof(num), "%d", n);
1773 			if (is_number(newt, &result))
1774 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1775 			else
1776 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1777 			if (*s++ == '\0')
1778 				break;
1779 		}
1780 		free(newt);
1781 
1782 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1783 		for (n = 0; ; ) {
1784 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1785 			while (ISWS(*s))
1786 				s++;
1787 			if (*s == '\0')
1788 				break;
1789 			n++;
1790 			t = s;
1791 			do
1792 				s++;
1793 			while (*s != '\0' && !ISWS(*s));
1794 			temp = *s;
1795 			setptr(s, '\0');
1796 			snprintf(num, sizeof(num), "%d", n);
1797 			if (is_number(t, & result))
1798 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1799 			else
1800 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1801 			setptr(s, temp);
1802 			if (*s != '\0')
1803 				s++;
1804 		}
1805 
1806 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1807 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1808 			char buf[10];
1809 			n++;
1810 			snprintf(num, sizeof(num), "%d", n);
1811 
1812 			for (j = 0; j < u8_nextlen(s); j++) {
1813 				buf[j] = s[j];
1814 			}
1815 			buf[j] = '\0';
1816 
1817 			if (isdigit((uschar)buf[0]))
1818 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1819 			else
1820 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1821 		}
1822 
1823 	} else if (*s != '\0') {  /* some random single character */
1824 		for (;;) {
1825 			n++;
1826 			t = s;
1827 			while (*s != sep && *s != '\0')
1828 				s++;
1829 			temp = *s;
1830 			setptr(s, '\0');
1831 			snprintf(num, sizeof(num), "%d", n);
1832 			if (is_number(t, & result))
1833 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1834 			else
1835 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1836 			setptr(s, temp);
1837 			if (*s++ == '\0')
1838 				break;
1839 		}
1840 	}
1841 	tempfree(ap);
1842 	xfree(origs);
1843 	xfree(origfs);
1844 	x = gettemp();
1845 	x->tval = NUM;
1846 	x->fval = n;
1847 	return(x);
1848 }
1849 
condexpr(Node ** a,int n)1850 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1851 {
1852 	Cell *x;
1853 
1854 	x = execute(a[0]);
1855 	if (istrue(x)) {
1856 		tempfree(x);
1857 		x = execute(a[1]);
1858 	} else {
1859 		tempfree(x);
1860 		x = execute(a[2]);
1861 	}
1862 	return(x);
1863 }
1864 
ifstat(Node ** a,int n)1865 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1866 {
1867 	Cell *x;
1868 
1869 	x = execute(a[0]);
1870 	if (istrue(x)) {
1871 		tempfree(x);
1872 		x = execute(a[1]);
1873 	} else if (a[2] != NULL) {
1874 		tempfree(x);
1875 		x = execute(a[2]);
1876 	}
1877 	return(x);
1878 }
1879 
whilestat(Node ** a,int n)1880 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1881 {
1882 	Cell *x;
1883 
1884 	for (;;) {
1885 		x = execute(a[0]);
1886 		if (!istrue(x))
1887 			return(x);
1888 		tempfree(x);
1889 		x = execute(a[1]);
1890 		if (isbreak(x)) {
1891 			x = True;
1892 			return(x);
1893 		}
1894 		if (isnext(x) || isexit(x) || isret(x))
1895 			return(x);
1896 		tempfree(x);
1897 	}
1898 }
1899 
dostat(Node ** a,int n)1900 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1901 {
1902 	Cell *x;
1903 
1904 	for (;;) {
1905 		x = execute(a[0]);
1906 		if (isbreak(x))
1907 			return True;
1908 		if (isnext(x) || isexit(x) || isret(x))
1909 			return(x);
1910 		tempfree(x);
1911 		x = execute(a[1]);
1912 		if (!istrue(x))
1913 			return(x);
1914 		tempfree(x);
1915 	}
1916 }
1917 
forstat(Node ** a,int n)1918 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1919 {
1920 	Cell *x;
1921 
1922 	x = execute(a[0]);
1923 	tempfree(x);
1924 	for (;;) {
1925 		if (a[1]!=NULL) {
1926 			x = execute(a[1]);
1927 			if (!istrue(x)) return(x);
1928 			else tempfree(x);
1929 		}
1930 		x = execute(a[3]);
1931 		if (isbreak(x))		/* turn off break */
1932 			return True;
1933 		if (isnext(x) || isexit(x) || isret(x))
1934 			return(x);
1935 		tempfree(x);
1936 		x = execute(a[2]);
1937 		tempfree(x);
1938 	}
1939 }
1940 
instat(Node ** a,int n)1941 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1942 {
1943 	Cell *x, *vp, *arrayp, *cp, *ncp;
1944 	Array *tp;
1945 	int i;
1946 
1947 	vp = execute(a[0]);
1948 	arrayp = execute(a[1]);
1949 	if (!isarr(arrayp)) {
1950 		return True;
1951 	}
1952 	tp = (Array *) arrayp->sval;
1953 	tempfree(arrayp);
1954 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1955 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1956 			setsval(vp, cp->nval);
1957 			ncp = cp->cnext;
1958 			x = execute(a[2]);
1959 			if (isbreak(x)) {
1960 				tempfree(vp);
1961 				return True;
1962 			}
1963 			if (isnext(x) || isexit(x) || isret(x)) {
1964 				tempfree(vp);
1965 				return(x);
1966 			}
1967 			tempfree(x);
1968 		}
1969 	}
1970 	return True;
1971 }
1972 
nawk_convert(const char * s,int (* fun_c)(int),wint_t (* fun_wc)(wint_t))1973 static char *nawk_convert(const char *s, int (*fun_c)(int),
1974     wint_t (*fun_wc)(wint_t))
1975 {
1976 	char *buf      = NULL;
1977 	char *pbuf     = NULL;
1978 	const char *ps = NULL;
1979 	size_t n       = 0;
1980 	wchar_t wc;
1981 	const size_t sz = awk_mb_cur_max;
1982 	int unused;
1983 
1984 	if (sz == 1) {
1985 		buf = tostring(s);
1986 
1987 		for (pbuf = buf; *pbuf; pbuf++)
1988 			*pbuf = fun_c((uschar)*pbuf);
1989 
1990 		return buf;
1991 	} else {
1992 		/* upper/lower character may be shorter/longer */
1993 		buf = tostringN(s, strlen(s) * sz + 1);
1994 
1995 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
1996 		/*
1997 		 * Reset internal state here too.
1998 		 * Assign result to avoid a compiler warning. (Casting to void
1999 		 * doesn't work.)
2000 		 * Increment said variable to avoid a different warning.
2001 		 */
2002 		unused = wctomb(NULL, L'\0');
2003 		unused++;
2004 
2005 		ps   = s;
2006 		pbuf = buf;
2007 		while (n = mbtowc(&wc, ps, sz),
2008 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2009 		{
2010 			ps += n;
2011 
2012 			n = wctomb(pbuf, fun_wc(wc));
2013 			if (n == (size_t)-1)
2014 				FATAL("illegal wide character %s", s);
2015 
2016 			pbuf += n;
2017 		}
2018 
2019 		*pbuf = '\0';
2020 
2021 		if (n)
2022 			FATAL("illegal byte sequence %s", s);
2023 
2024 		return buf;
2025 	}
2026 }
2027 
#ifdef __DJGPP__
/*
 * Stand-ins used when building with DJGPP: only values 0..255 are
 * converted, through the single-byte <ctype.h> functions; anything else is
 * returned unchanged.
 */
static wint_t towupper(wint_t wc)
{
	if (wc < 0 || wc >= 256)
		return wc;

	return toupper(wc & 0xFF);
}

static wint_t towlower(wint_t wc)
{
	if (wc < 0 || wc >= 256)
		return wc;

	return tolower(wc & 0xFF);
}
#endif
2045 
/* Upper-case copy of s, as produced by nawk_convert(). */
static char *nawk_toupper(const char *s)
{
	char *converted = nawk_convert(s, toupper, towupper);

	return converted;
}
2050 
/* Lower-case copy of s, as produced by nawk_convert(). */
static char *nawk_tolower(const char *s)
{
	char *converted = nawk_convert(s, tolower, towlower);

	return converted;
}
2055 
2056 
2057 
bltin(Node ** a,int n)2058 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2059 {
2060 	Cell *x, *y;
2061 	Awkfloat u = 0;
2062 	int t, sz;
2063 	Awkfloat tmp;
2064 	char *buf, *fmt;
2065 	Node *nextarg;
2066 	FILE *fp;
2067 	int status = 0;
2068 	time_t tv;
2069 	struct tm *tm, tmbuf;
2070 	int estatus = 0;
2071 
2072 	t = ptoi(a[0]);
2073 	x = execute(a[1]);
2074 	nextarg = a[1]->nnext;
2075 	switch (t) {
2076 	case FLENGTH:
2077 		if (isarr(x))
2078 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2079 		else
2080 			u = u8_strlen(getsval(x));
2081 		break;
2082 	case FLOG:
2083 		errno = 0;
2084 		u = errcheck(log(getfval(x)), "log");
2085 		break;
2086 	case FINT:
2087 		modf(getfval(x), &u); break;
2088 	case FEXP:
2089 		errno = 0;
2090 		u = errcheck(exp(getfval(x)), "exp");
2091 		break;
2092 	case FSQRT:
2093 		errno = 0;
2094 		u = errcheck(sqrt(getfval(x)), "sqrt");
2095 		break;
2096 	case FSIN:
2097 		u = sin(getfval(x)); break;
2098 	case FCOS:
2099 		u = cos(getfval(x)); break;
2100 	case FATAN:
2101 		if (nextarg == NULL) {
2102 			WARNING("atan2 requires two arguments; returning 1.0");
2103 			u = 1.0;
2104 		} else {
2105 			y = execute(a[1]->nnext);
2106 			u = atan2(getfval(x), getfval(y));
2107 			tempfree(y);
2108 			nextarg = nextarg->nnext;
2109 		}
2110 		break;
2111 	case FCOMPL:
2112 		u = ~((int)getfval(x));
2113 		break;
2114 	case FAND:
2115 		if (nextarg == 0) {
2116 			WARNING("and requires two arguments; returning 0");
2117 			u = 0;
2118 			break;
2119 		}
2120 		y = execute(a[1]->nnext);
2121 		u = ((int)getfval(x)) & ((int)getfval(y));
2122 		tempfree(y);
2123 		nextarg = nextarg->nnext;
2124 		break;
2125 	case FFOR:
2126 		if (nextarg == 0) {
2127 			WARNING("or requires two arguments; returning 0");
2128 			u = 0;
2129 			break;
2130 		}
2131 		y = execute(a[1]->nnext);
2132 		u = ((int)getfval(x)) | ((int)getfval(y));
2133 		tempfree(y);
2134 		nextarg = nextarg->nnext;
2135 		break;
2136 	case FXOR:
2137 		if (nextarg == 0) {
2138 			WARNING("xor requires two arguments; returning 0");
2139 			u = 0;
2140 			break;
2141 		}
2142 		y = execute(a[1]->nnext);
2143 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2144 		tempfree(y);
2145 		nextarg = nextarg->nnext;
2146 		break;
2147 	case FLSHIFT:
2148 		if (nextarg == 0) {
2149 			WARNING("lshift requires two arguments; returning 0");
2150 			u = 0;
2151 			break;
2152 		}
2153 		y = execute(a[1]->nnext);
2154 		u = ((int)getfval(x)) << ((int)getfval(y));
2155 		tempfree(y);
2156 		nextarg = nextarg->nnext;
2157 		break;
2158 	case FRSHIFT:
2159 		if (nextarg == 0) {
2160 			WARNING("rshift requires two arguments; returning 0");
2161 			u = 0;
2162 			break;
2163 		}
2164 		y = execute(a[1]->nnext);
2165 		u = ((int)getfval(x)) >> ((int)getfval(y));
2166 		tempfree(y);
2167 		nextarg = nextarg->nnext;
2168 		break;
2169 	case FSYSTEM:
2170 		fflush(stdout);		/* in case something is buffered already */
2171 		estatus = status = system(getsval(x));
2172 		if (status != -1) {
2173 			if (WIFEXITED(status)) {
2174 				estatus = WEXITSTATUS(status);
2175 			} else if (WIFSIGNALED(status)) {
2176 				estatus = WTERMSIG(status) + 256;
2177 #ifdef WCOREDUMP
2178 				if (WCOREDUMP(status))
2179 					estatus += 256;
2180 #endif
2181 			} else	/* something else?!? */
2182 				estatus = 0;
2183 		}
2184 		/* else estatus was set to -1 */
2185 		u = estatus;
2186 		break;
2187 	case FRAND:
2188 		/* random() returns numbers in [0..2^31-1]
2189 		 * in order to get a number in [0, 1), divide it by 2^31
2190 		 */
2191 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2192 		break;
2193 	case FSRAND:
2194 		if (isrec(x))	/* no argument provided */
2195 			u = time((time_t *)0);
2196 		else
2197 			u = getfval(x);
2198 		tmp = u;
2199 		srandom((unsigned long) u);
2200 		u = srand_seed;
2201 		srand_seed = tmp;
2202 		break;
2203 	case FTOUPPER:
2204 	case FTOLOWER:
2205 		if (t == FTOUPPER)
2206 			buf = nawk_toupper(getsval(x));
2207 		else
2208 			buf = nawk_tolower(getsval(x));
2209 		tempfree(x);
2210 		x = gettemp();
2211 		setsval(x, buf);
2212 		free(buf);
2213 		return x;
2214 	case FFLUSH:
2215 		if (isrec(x) || strlen(getsval(x)) == 0) {
2216 			flush_all();	/* fflush() or fflush("") -> all */
2217 			u = 0;
2218 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2219 			u = EOF;
2220 		else
2221 			u = fflush(fp);
2222 		break;
2223 	case FMKTIME:
2224 		memset(&tmbuf, 0, sizeof(tmbuf));
2225 		tm = &tmbuf;
2226 		t = sscanf(getsval(x), "%d %d %d %d %d %d %d",
2227 		    &tm->tm_year, &tm->tm_mon, &tm->tm_mday, &tm->tm_hour,
2228 		    &tm->tm_min, &tm->tm_sec, &tm->tm_isdst);
2229 		switch (t) {
2230 		case 6:
2231 			tm->tm_isdst = -1;	/* let mktime figure it out */
2232 			/* FALLTHROUGH */
2233 		case 7:
2234 			tm->tm_year -= 1900;
2235 			tm->tm_mon--;
2236 			u = mktime(tm);
2237 			break;
2238 		default:
2239 			u = -1;
2240 			break;
2241 		}
2242 		break;
2243 	case FSYSTIME:
2244 		u = time((time_t *) 0);
2245 		break;
2246 	case FSTRFTIME:
2247 		/* strftime([format [,timestamp]]) */
2248 		if (nextarg) {
2249 			y = execute(nextarg);
2250 			nextarg = nextarg->nnext;
2251 			tv = (time_t) getfval(y);
2252 			tempfree(y);
2253 		} else
2254 			tv = time((time_t *) 0);
2255 		tm = localtime(&tv);
2256 		if (tm == NULL)
2257 			FATAL("bad time %ld", (long)tv);
2258 
2259 		if (isrec(x)) {
2260 			/* format argument not provided, use default */
2261 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2262 		} else
2263 			fmt = tostring(getsval(x));
2264 
2265 		sz = 32;
2266 		buf = NULL;
2267 		do {
2268 			if ((buf = realloc(buf, (sz *= 2))) == NULL)
2269 				FATAL("out of memory in strftime");
2270 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2271 
2272 		y = gettemp();
2273 		setsval(y, buf);
2274 		free(fmt);
2275 		free(buf);
2276 
2277 		return y;
2278 	default:	/* can't happen */
2279 		FATAL("illegal function type %d", t);
2280 		break;
2281 	}
2282 	tempfree(x);
2283 	x = gettemp();
2284 	setfval(x, u);
2285 	if (nextarg != NULL) {
2286 		WARNING("warning: function has too many arguments");
2287 		for ( ; nextarg; nextarg = nextarg->nnext) {
2288 			y = execute(nextarg);
2289 			tempfree(y);
2290 		}
2291 	}
2292 	return(x);
2293 }
2294 
printstat(Node ** a,int n)2295 Cell *printstat(Node **a, int n)	/* print a[0] */
2296 {
2297 	Node *x;
2298 	Cell *y;
2299 	FILE *fp;
2300 
2301 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2302 		fp = stdout;
2303 	else
2304 		fp = redirect(ptoi(a[1]), a[2]);
2305 	for (x = a[0]; x != NULL; x = x->nnext) {
2306 		y = execute(x);
2307 		fputs(getpssval(y), fp);
2308 		tempfree(y);
2309 		if (x->nnext == NULL)
2310 			fputs(getsval(orsloc), fp);
2311 		else
2312 			fputs(getsval(ofsloc), fp);
2313 	}
2314 	if (a[1] != NULL)
2315 		fflush(fp);
2316 	if (ferror(fp))
2317 		FATAL("write error on %s", filename(fp));
2318 	return(True);
2319 }
2320 
nullproc(Node ** a,int n)2321 Cell *nullproc(Node **a, int n)
2322 {
2323 	return 0;
2324 }
2325 
2326 
/*
 * redirect - open the target of an i/o redirection.
 *
 * b is executed to obtain the file or command name, which is handed to
 * openfile() together with the redirection kind a.  Failure to open is
 * fatal, so the returned stream is never NULL.
 */
FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
{
	Cell *target = execute(b);
	char *name = getsval(target);
	FILE *stream = openfile(a, name, NULL);

	/* name is still needed for the message, so release target afterwards */
	if (stream == NULL)
		FATAL("can't open file %s", name);
	tempfree(target);
	return stream;
}
2341 
/* One entry per stream known to the running program. */
struct files {
	FILE		*fp;	/* the stream; openfile() treats NULL as a free slot */
	const char	*fname;	/* name the stream is registered under */
	int		mode;	/* '|', 'a', 'w' => LE/LT, GT */
};

struct files *files;	/* the table, allocated by stdinit() */

size_t nfiles;		/* number of entries in files[] */
2349 
stdinit(void)2350 static void stdinit(void)	/* in case stdin, etc., are not constants */
2351 {
2352 	nfiles = FOPEN_MAX;
2353 	files = (struct files *) calloc(nfiles, sizeof(*files));
2354 	if (files == NULL)
2355 		FATAL("can't allocate file memory for %zu files", nfiles);
2356         files[0].fp = stdin;
2357 	files[0].fname = tostring("/dev/stdin");
2358 	files[0].mode = LT;
2359         files[1].fp = stdout;
2360 	files[1].fname = tostring("/dev/stdout");
2361 	files[1].mode = GT;
2362         files[2].fp = stderr;
2363 	files[2].fname = tostring("/dev/stderr");
2364 	files[2].mode = GT;
2365 }
2366 
openfile(int a,const char * us,bool * pnewflag)2367 FILE *openfile(int a, const char *us, bool *pnewflag)
2368 {
2369 	const char *s = us;
2370 	size_t i;
2371 	int m;
2372 	FILE *fp = NULL;
2373 	struct stat sbuf;
2374 
2375 	if (*s == '\0')
2376 		FATAL("null file name in print or getline");
2377 
2378 	for (i = 0; i < nfiles; i++)
2379 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2380 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2381 		     a == FFLUSH)) {
2382 			if (pnewflag)
2383 				*pnewflag = false;
2384 			return files[i].fp;
2385 		}
2386 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2387 		return NULL;
2388 	for (i = 0; i < nfiles; i++)
2389 		if (files[i].fp == NULL)
2390 			break;
2391 	if (i >= nfiles) {
2392 		struct files *nf;
2393 		size_t nnf = nfiles + FOPEN_MAX;
2394 		nf = (struct files *) realloc(files, nnf * sizeof(*nf));
2395 		if (nf == NULL)
2396 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2397 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2398 		nfiles = nnf;
2399 		files = nf;
2400 	}
2401 
2402 	fflush(stdout);	/* force a semblance of order */
2403 
2404 	/* don't try to read or write a directory */
2405 	if (a == LT || a == GT || a == APPEND)
2406 		if (stat(s, &sbuf) == 0 && S_ISDIR(sbuf.st_mode))
2407 				return NULL;
2408 
2409 	m = a;
2410 	if (a == GT) {
2411 		fp = fopen(s, "w");
2412 	} else if (a == APPEND) {
2413 		fp = fopen(s, "a");
2414 		m = GT;	/* so can mix > and >> */
2415 	} else if (a == '|') {	/* output pipe */
2416 		fp = popen(s, "w");
2417 	} else if (a == LE) {	/* input pipe */
2418 		fp = popen(s, "r");
2419 	} else if (a == LT) {	/* getline <file */
2420 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2421 	} else	/* can't happen */
2422 		FATAL("illegal redirection %d", a);
2423 	if (fp != NULL) {
2424 		files[i].fname = tostring(s);
2425 		files[i].fp = fp;
2426 		files[i].mode = m;
2427 		if (pnewflag)
2428 			*pnewflag = true;
2429 		if (fp != stdin && fp != stdout && fp != stderr)
2430 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2431 	}
2432 	return fp;
2433 }
2434 
filename(FILE * fp)2435 const char *filename(FILE *fp)
2436 {
2437 	size_t i;
2438 
2439 	for (i = 0; i < nfiles; i++)
2440 		if (fp == files[i].fp)
2441 			return files[i].fname;
2442 	return "???";
2443 }
2444 
closefile(Node ** a,int n)2445 Cell *closefile(Node **a, int n)
2446 {
2447  	Cell *x;
2448 	size_t i;
2449 	bool stat;
2450 
2451  	x = execute(a[0]);
2452  	getsval(x);
2453 	stat = true;
2454  	for (i = 0; i < nfiles; i++) {
2455 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2456 			continue;
2457 		if (files[i].mode == GT || files[i].mode == '|')
2458 			fflush(files[i].fp);
2459 		if (ferror(files[i].fp)) {
2460 			if ((files[i].mode == GT && files[i].fp != stderr)
2461 			  || files[i].mode == '|')
2462 				FATAL("write error on %s", files[i].fname);
2463 			else
2464 				WARNING("i/o error occurred on %s", files[i].fname);
2465 		}
2466 		if (files[i].fp == stdin || files[i].fp == stdout ||
2467 		    files[i].fp == stderr)
2468 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2469 		else if (files[i].mode == '|' || files[i].mode == LE)
2470 			stat = pclose(files[i].fp) == -1;
2471 		else
2472 			stat = fclose(files[i].fp) == EOF;
2473 		if (stat)
2474 			WARNING("i/o error occurred closing %s", files[i].fname);
2475 		xfree(files[i].fname);
2476 		files[i].fname = NULL;	/* watch out for ref thru this */
2477 		files[i].fp = NULL;
2478 		break;
2479  	}
2480  	tempfree(x);
2481  	x = gettemp();
2482 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2483  	return(x);
2484 }
2485 
closeall(void)2486 void closeall(void)
2487 {
2488 	size_t i;
2489 	bool stat = false;
2490 
2491 	for (i = 0; i < nfiles; i++) {
2492 		if (! files[i].fp)
2493 			continue;
2494 		if (files[i].mode == GT || files[i].mode == '|')
2495 			fflush(files[i].fp);
2496 		if (ferror(files[i].fp)) {
2497 			if ((files[i].mode == GT && files[i].fp != stderr)
2498 			  || files[i].mode == '|')
2499 				FATAL("write error on %s", files[i].fname);
2500 			else
2501 				WARNING("i/o error occurred on %s", files[i].fname);
2502 		}
2503 		if (files[i].fp == stdin || files[i].fp == stdout ||
2504 		    files[i].fp == stderr)
2505 			continue;
2506 		if (files[i].mode == '|' || files[i].mode == LE)
2507 			stat = pclose(files[i].fp) == -1;
2508 		else
2509 			stat = fclose(files[i].fp) == EOF;
2510 		if (stat)
2511 			WARNING("i/o error occurred while closing %s", files[i].fname);
2512 	}
2513 }
2514 
flush_all(void)2515 static void flush_all(void)
2516 {
2517 	size_t i;
2518 
2519 	for (i = 0; i < nfiles; i++)
2520 		if (files[i].fp)
2521 			fflush(files[i].fp);
2522 }
2523 
2524 void backsub(char **pb_ptr, const char **sptr_ptr);
2525 
dosub(Node ** a,int subop)2526 Cell *dosub(Node **a, int subop)        /* sub and gsub */
2527 {
2528 	fa *pfa;
2529 	int tempstat = 0;
2530 	char *repl;
2531 	Cell *x;
2532 
2533 	char *buf = NULL;
2534 	char *pb = NULL;
2535 	int bufsz = recsize;
2536 
2537 	const char *r, *s;
2538 	const char *start;
2539 	const char *noempty = NULL;      /* empty match disallowed here */
2540 	size_t m = 0;                    /* match count */
2541 	size_t whichm = 0;               /* which match to select, 0 = global */
2542 	int mtype;                       /* match type */
2543 
2544 	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
2545 		pfa = (fa *) a[1];
2546 	} else {
2547 		x = execute(a[1]);
2548 		pfa = makedfa(getsval(x), 1);
2549 		tempfree(x);
2550 	}
2551 
2552 	x = execute(a[2]);	/* replacement string */
2553 	repl = tostring(getsval(x));
2554 	tempfree(x);
2555 
2556 	switch (subop) {
2557 	case SUB:
2558 		whichm = 1;
2559 		x = execute(a[3]);    /* source string */
2560 		break;
2561 	case GSUB:
2562 		whichm = 0;
2563 		x = execute(a[3]);    /* source string */
2564 		break;
2565 	default:
2566 		FATAL("dosub: unrecognized subop: %d", subop);
2567 	}
2568 
2569 	start = getsval(x);
2570 	while (pmatch(pfa, start)) {
2571 		if (buf == NULL) {
2572 			if ((pb = buf = (char *) malloc(bufsz)) == NULL)
2573 				FATAL("out of memory in dosub");
2574 			tempstat = pfa->initstat;
2575 			pfa->initstat = 2;
2576 		}
2577 
2578 		/* match types */
2579 		#define	MT_IGNORE  0  /* unselected or invalid */
2580 		#define MT_INSERT  1  /* selected, empty */
2581 		#define MT_REPLACE 2  /* selected, not empty */
2582 
2583 		/* an empty match just after replacement is invalid */
2584 
2585 		if (patbeg == noempty && patlen == 0) {
2586 			mtype = MT_IGNORE;    /* invalid, not counted */
2587 		} else if (whichm == ++m || whichm == 0) {
2588 			mtype = patlen ? MT_REPLACE : MT_INSERT;
2589 		} else {
2590 			mtype = MT_IGNORE;    /* unselected, but counted */
2591 		}
2592 
2593 		/* leading text: */
2594 		if (patbeg > start) {
2595 			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
2596 				recsize, &pb, "dosub");
2597 			s = start;
2598 			while (s < patbeg)
2599 				*pb++ = *s++;
2600 		}
2601 
2602 		if (mtype == MT_IGNORE)
2603 			goto matching_text;  /* skip replacement text */
2604 
2605 		r = repl;
2606 		while (*r != 0) {
2607 			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
2608 			if (*r == '\\') {
2609 				backsub(&pb, &r);
2610 			} else if (*r == '&') {
2611 				r++;
2612 				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
2613 					&pb, "dosub");
2614 				for (s = patbeg; s < patbeg+patlen; )
2615 					*pb++ = *s++;
2616 			} else {
2617 				*pb++ = *r++;
2618 			}
2619 		}
2620 
2621 matching_text:
2622 		if (mtype == MT_REPLACE || *patbeg == '\0')
2623 			goto next_search;  /* skip matching text */
2624 
2625 		if (patlen == 0)
2626 			patlen = u8_nextlen(patbeg);
2627 		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
2628 		s = patbeg;
2629 		while (s < patbeg + patlen)
2630 			*pb++ = *s++;
2631 
2632 next_search:
2633 		start = patbeg + patlen;
2634 		if (m == whichm || *patbeg == '\0')
2635 			break;
2636 		if (mtype == MT_REPLACE)
2637 			noempty = start;
2638 
2639 		#undef MT_IGNORE
2640 		#undef MT_INSERT
2641 		#undef MT_REPLACE
2642 	}
2643 
2644 	xfree(repl);
2645 
2646 	if (buf != NULL) {
2647 		pfa->initstat = tempstat;
2648 
2649 		/* trailing text */
2650 		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
2651 		while ((*pb++ = *start++) != '\0')
2652 			;
2653 
2654 		setsval(x, buf);
2655 		free(buf);
2656 	}
2657 
2658 	tempfree(x);
2659 	x = gettemp();
2660 	x->tval = NUM;
2661 	x->fval = m;
2662 	return x;
2663 }
2664 
gensub(Node ** a,int nnn)2665 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2666 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2667 {
2668 	Cell *x, *y, *res, *h;
2669 	char *rptr;
2670 	const char *sptr;
2671 	char *buf, *pb;
2672 	const char *t, *q;
2673 	fa *pfa;
2674 	int mflag, tempstat, num, whichm;
2675 	int bufsz = recsize;
2676 
2677 	if ((buf = malloc(bufsz)) == NULL)
2678 		FATAL("out of memory in gensub");
2679 	mflag = 0;	/* if mflag == 0, can replace empty string */
2680 	num = 0;
2681 	x = execute(a[4]);	/* source string */
2682 	t = getsval(x);
2683 	res = copycell(x);	/* target string - initially copy of source */
2684 	res->csub = CTEMP;	/* result values are temporary */
2685 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2686 		pfa = (fa *) a[1];	/* regular expression */
2687 	else {
2688 		y = execute(a[1]);
2689 		pfa = makedfa(getsval(y), 1);
2690 		tempfree(y);
2691 	}
2692 	y = execute(a[2]);	/* replacement string */
2693 	h = execute(a[3]);	/* which matches should be replaced */
2694 	sptr = getsval(h);
2695 	if (sptr[0] == 'g' || sptr[0] == 'G')
2696 		whichm = -1;
2697 	else {
2698 		/*
2699 		 * The specified number is index of replacement, starting
2700 		 * from 1. GNU awk treats index lower than 0 same as
2701 		 * 1, we do same for compatibility.
2702 		 */
2703 		whichm = (int) getfval(h) - 1;
2704 		if (whichm < 0)
2705 			whichm = 0;
2706 	}
2707 	tempfree(h);
2708 
2709 	if (pmatch(pfa, t)) {
2710 		char *sl;
2711 
2712 		tempstat = pfa->initstat;
2713 		pfa->initstat = 2;
2714 		pb = buf;
2715 		rptr = getsval(y);
2716 		/*
2717 		 * XXX if there are any backreferences in subst string,
2718 		 * complain now.
2719 		 */
2720 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2721 			if (strchr("0123456789", sl[1])) {
2722 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2723 			}
2724 		}
2725 
2726 		do {
2727 			if (whichm >= 0 && whichm != num) {
2728 				num++;
2729 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2730 
2731 				/* copy the part of string up to and including
2732 				 * match to output buffer */
2733 				while (t < patbeg + patlen)
2734 					*pb++ = *t++;
2735 				continue;
2736 			}
2737 
2738 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2739 				if (mflag == 0) {	/* can replace empty */
2740 					num++;
2741 					sptr = rptr;
2742 					while (*sptr != 0) {
2743 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2744 						if (*sptr == '\\') {
2745 							backsub(&pb, &sptr);
2746 						} else if (*sptr == '&') {
2747 							sptr++;
2748 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2749 							for (q = patbeg; q < patbeg+patlen; )
2750 								*pb++ = *q++;
2751 						} else
2752 							*pb++ = *sptr++;
2753 					}
2754 				}
2755 				if (*t == 0)	/* at end */
2756 					goto done;
2757 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2758 				*pb++ = *t++;
2759 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2760 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2761 				mflag = 0;
2762 			}
2763 			else {	/* matched nonempty string */
2764 				num++;
2765 				sptr = t;
2766 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2767 				while (sptr < patbeg)
2768 					*pb++ = *sptr++;
2769 				sptr = rptr;
2770 				while (*sptr != 0) {
2771 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2772 					if (*sptr == '\\') {
2773 						backsub(&pb, &sptr);
2774 					} else if (*sptr == '&') {
2775 						sptr++;
2776 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2777 						for (q = patbeg; q < patbeg+patlen; )
2778 							*pb++ = *q++;
2779 					} else
2780 						*pb++ = *sptr++;
2781 				}
2782 				t = patbeg + patlen;
2783 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2784 					goto done;
2785 				if (pb > buf + bufsz)
2786 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2787 				mflag = 1;
2788 			}
2789 		} while (pmatch(pfa,t));
2790 		sptr = t;
2791 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2792 		while ((*pb++ = *sptr++) != 0)
2793 			;
2794 	done:	if (pb > buf + bufsz)
2795 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2796 		*pb = '\0';
2797 		setsval(res, buf);
2798 		pfa->initstat = tempstat;
2799 	}
2800 	tempfree(x);
2801 	tempfree(y);
2802 	free(buf);
2803 	return(res);
2804 }
2805 
/*
 * backsub - translate one backslash sequence of a replacement string.
 *
 * On entry *sptr_ptr points at a backslash and *pb_ptr at the next free
 * output byte; both are advanced past what was consumed and produced.
 * At most two bytes are written.  The presence of POSIXLY_CORRECT in the
 * environment is looked up on the first call only and remembered.
 */
void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
{						/* sptr[0] == '\\' */
	static bool env_checked = false;
	static bool posix_mode = false;
	const char *in = *sptr_ptr;
	char *out = *pb_ptr;

	if (!env_checked) {
		env_checked = true;
		posix_mode = (getenv("POSIXLY_CORRECT") != NULL);
	}

	if (in[1] == '&') {		/* literal & */
		*out++ = in[1];
		in += 2;
	} else if (in[1] != '\\') {	/* literal \ */
		*out++ = *in++;
	} else if (in[2] == '&') {	/* \\& -> \ + matched */
		*out++ = '\\';
		in += 2;		/* the & is left for the caller */
	} else if (in[2] == '\\' && in[3] == '&') {	/* \\\& -> \& */
		*out++ = '\\';
		*out++ = '&';
		in += 4;
	} else if (posix_mode) {	/* \\x -> \x */
		*out++ = in[1];
		in += 2;
	} else {			/* \\x -> \\x */
		*out++ = *in++;
		*out++ = *in++;
	}

	*sptr_ptr = in;
	*pb_ptr = out;
}
2842 
/*
 * wide_char_to_byte_str - encode the code point rune as UTF-8.
 *
 * Returns a pointer to a NUL-terminated static buffer that is overwritten
 * by the next successful call, and stores the number of encoded bytes
 * (1 to 4, terminator excluded) in *outlen.  Returns NULL, leaving the
 * buffer and *outlen untouched, when rune is negative or above 0x10FFFF.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char buf[5];
	int ntrail;	/* continuation bytes following the lead byte */
	int lead;	/* marker bits of the lead byte */
	int shift;
	int len = 0;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	memset(buf, 0, sizeof(buf));

	if (rune < 0x80) {		/* 0xxxxxxx */
		ntrail = 0;
		lead = 0x00;
	} else if (rune < 0x800) {	/* 110xxxxx 10xxxxxx */
		ntrail = 1;
		lead = 0xC0;
	} else if (rune < 0x10000) {	/* 1110xxxx 10xxxxxx 10xxxxxx */
		ntrail = 2;
		lead = 0xE0;
	} else {			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		ntrail = 3;
		lead = 0xF0;
	}

	/* the lead byte carries the top bits, each later byte six more */
	buf[len++] = lead | (rune >> (6 * ntrail));
	for (shift = 6 * (ntrail - 1); shift >= 0; shift -= 6)
		buf[len++] = 0x80 | ((rune >> shift) & 0x3F);

	*outlen = len;
	buf[len] = '\0';

	return buf;
}
2880