xref: /illumos-gate/usr/src/cmd/awk/lex.c (revision 12042ab213b3af68474f48555504db816a449211)
1 /*
2  * Copyright (C) Lucent Technologies 1997
3  * All Rights Reserved
4  *
5  * Permission to use, copy, modify, and distribute this software and
6  * its documentation for any purpose and without fee is hereby
7  * granted, provided that the above copyright notice appear in all
8  * copies and that both that the copyright notice and this
9  * permission notice and warranty disclaimer appear in supporting
10  * documentation, and that the name Lucent Technologies or any of
11  * its entities not be used in advertising or publicity pertaining
12  * to distribution of the software without specific, written prior
13  * permission.
14  *
15  * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17  * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21  * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22  * THIS SOFTWARE.
23  */
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "y.tab.h"
31 
/*
 * Parser-side state defined elsewhere.  yylval receives the semantic value
 * of each token produced by this file; infunc is consulted by yylex() and
 * word() to diagnose nested function definitions and `return' outside a
 * function, and to decide whether a name may be a function argument.
 */
32 extern YYSTYPE	yylval;
33 extern int	infunc;
34 
/*
 * Lexer bookkeeping.  lineno starts at 1 and is bumped by yylex() and
 * string() for each line ending they consume.  The three counters below are
 * incremented by yylex() on an opening bracket and decremented on the
 * matching closer; a negative value triggers an "extra ..." syntax error.
 */
35 off_t	lineno	= 1;
36 int	bracecnt = 0;	/* unmatched '{' seen so far */
37 int	brackcnt  = 0;	/* unmatched '[' seen so far */
38 int	parencnt = 0;	/* unmatched '(' seen so far */
39 
/*
 * One reserved word or built-in function name recognized by word().
 */
40 typedef struct Keyword {
41 	const char *word;	/* spelling, compared with strcmp() */
42 	int	sub;		/* value word() stores in yylval.i */
43 	int	type;		/* token type word() returns */
44 } Keyword;
45 
/*
 * Table of reserved words and built-ins, looked up by binsearch() from
 * word().  Entries must stay in strcmp() order (upper case sorts before
 * lower case) or the binary search will miss them.  For built-in functions
 * the type is BLTIN and sub identifies the particular function.
 */
46 Keyword keywords[] = {	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "atan2",	FATAN,		BLTIN },
51 	{ "break",	BREAK,		BREAK },
52 	{ "close",	CLOSE,		CLOSE },
53 	{ "continue",	CONTINUE,	CONTINUE },
54 	{ "cos",	FCOS,		BLTIN },
55 	{ "delete",	DELETE,		DELETE },
56 	{ "do",		DO,		DO },
57 	{ "else",	ELSE,		ELSE },
58 	{ "exit",	EXIT,		EXIT },
59 	{ "exp",	FEXP,		BLTIN },
60 	{ "fflush",	FFLUSH,		BLTIN },
61 	{ "for",	FOR,		FOR },
62 	{ "func",	FUNC,		FUNC },
63 	{ "function",	FUNC,		FUNC },
64 	{ "getline",	GETLINE,	GETLINE },
65 	{ "gsub",	GSUB,		GSUB },
66 	{ "if",		IF,		IF },
67 	{ "in",		IN,		IN },
68 	{ "index",	INDEX,		INDEX },
69 	{ "int",	FINT,		BLTIN },
70 	{ "length",	FLENGTH,	BLTIN },
71 	{ "log",	FLOG,		BLTIN },
72 	{ "match",	MATCHFCN,	MATCHFCN },
73 	{ "next",	NEXT,		NEXT },
74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
75 	{ "print",	PRINT,		PRINT },
76 	{ "printf",	PRINTF,		PRINTF },
77 	{ "rand",	FRAND,		BLTIN },
78 	{ "return",	RETURN,		RETURN },
79 	{ "sin",	FSIN,		BLTIN },
80 	{ "split",	SPLIT,		SPLIT },
81 	{ "sprintf",	SPRINTF,	SPRINTF },
82 	{ "sqrt",	FSQRT,		BLTIN },
83 	{ "srand",	FSRAND,		BLTIN },
84 	{ "sub",	SUB,		SUB },
85 	{ "substr",	SUBSTR,		SUBSTR },
86 	{ "system",	FSYSTEM,	BLTIN },
87 	{ "tolower",	FTOLOWER,	BLTIN },
88 	{ "toupper",	FTOUPPER,	BLTIN },
89 	{ "while",	WHILE,		WHILE },
90 };
91 
/*
 * Return token x from the enclosing function, first printing its name
 * (via tokname()) when the debug flag dbg is set.
 *
 * The body is wrapped in do { } while (0) so that "RET(x);" is exactly one
 * statement and may be used as the unbraced arm of an if/else.  x is
 * evaluated twice when dbg is set, so it must be free of side effects.
 */
#define	RET(x)	do { \
	if (dbg) \
		(void) printf("lex %s\n", tokname(x)); \
	return (x); \
} while (0)
93 
/*
 * Look at the next input character without consuming it: the character
 * is read with input() and immediately handed back with unput().
 * Returns that character (0 at end of input).
 */
int
peek(void)
{
	int next;

	next = input();
	unput(next);
	return (next);
}
101 
/*
 * Read the next raw token from the lexer input into the caller's buffer.
 *
 * pbuf, psz	address of the buffer pointer and of its size.  The buffer
 *		must hold at least two bytes on entry; it is grown with
 *		adjbuf() as needed and *pbuf / *psz are updated on return.
 *
 * Returns
 *	0	at end of input (buffer untouched);
 *	'a'	for a name: a letter or '_' followed by letters, digits, '_';
 *	'0'	for a number; the buffer holds the longest prefix strtod()
 *		accepted and the remainder is pushed back on the input;
 *	else	the character itself, which is also left in the buffer as a
 *		one-character string.
 */
int
gettok(char **pbuf, size_t *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	size_t sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return (0);
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return (c);

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for (; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating
			 * NUL; the loop can end at end of input without
			 * another chance to grow the buffer.
			 */
			if ((size_t)(bp - buf) + 2 > sz &&
			    !adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
				FATAL("out of space for name %.10s...", buf);
			if (isalnum(c) || c == '_') {
				*bp++ = c;
			} else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;

		/* read input until can't be a number */
		for (; (c = input()) != 0; ) {
			/* same reservation as above: character plus NUL */
			if ((size_t)(bp - buf) + 2 > sz &&
			    !adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
				FATAL("out of space for number %.10s...", buf);
			if (isdigit(c) || c == 'e' || c == 'E' ||
			    c == '.' || c == '+' || c == '-') {
				*bp++ = c;
			} else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		(void) strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: once buf[1]
			 * is zeroed the tail is no longer reachable.
			 */
			unputstr(buf + 1);	/* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return (retc);
}
165 
/*
 * Forward declarations for the helpers yylex() dispatches to, and the two
 * one-shot flags that yylex() tests (and clears) at the top of each call.
 * sc is set by yylex() itself when it sees '}' (it returns ';' first and
 * the '}' on the following call); reg is set through startreg().
 */
166 int	word(char *);
167 int	string(void);
168 int	regexpr(void);
169 int	sc	= 0;	/* 1 => return a } right now */
170 int	reg	= 0;	/* 1 => return a REGEXPR now */
171 
172 int
173 yylex(void)
174 {
175 	int c;
176 	static char *buf = NULL;
177 	/* BUG: setting this small causes core dump! */
178 	static size_t bufsize = 5;
179 
180 	if (buf == NULL && (buf = (char *)malloc(bufsize)) == NULL)
181 		FATAL("out of space in yylex");
182 	if (sc) {
183 		sc = 0;
184 		RET('}');
185 	}
186 	if (reg) {
187 		reg = 0;
188 		return (regexpr());
189 	}
190 	for (;;) {
191 		c = gettok(&buf, &bufsize);
192 		if (c == 0)
193 			return (0);
194 		if (isalpha(c) || c == '_')
195 			return (word(buf));
196 		if (isdigit(c)) {
197 			yylval.cp = setsymtab(
198 			    buf, tostring(buf), atof(buf), CON|NUM, symtab);
199 			/* should this also have STR set? */
200 			RET(NUMBER);
201 		}
202 
203 		yylval.i = c;
204 		switch (c) {
205 		case '\n':	/* {EOL} */
206 			lineno++;
207 			RET(NL);
208 		case '\r':	/* assume \n is coming */
209 		case ' ':	/* {WS}+ */
210 		case '\t':
211 			break;
212 		case '#':	/* #.* strip comments */
213 			while ((c = input()) != '\n' && c != 0)
214 				;
215 			unput(c);
216 			break;
217 		case ';':
218 			RET(';');
219 		case '\\':
220 			if (peek() == '\n') {
221 				(void) input();
222 				lineno++;
223 			} else if (peek() == '\r') {
224 				(void) input();
225 				(void) input();	/* BUG: check for \n */
226 				lineno++;
227 			} else {
228 				RET(c);
229 			}
230 			break;
231 		case '&':
232 			if (peek() == '&') {
233 				(void) input();
234 				RET(AND);
235 			} else
236 				RET('&');
237 		case '|':
238 			if (peek() == '|') {
239 				(void) input();
240 				RET(BOR);
241 			} else
242 				RET('|');
243 		case '!':
244 			if (peek() == '=') {
245 				(void) input();
246 				yylval.i = NE;
247 				RET(NE);
248 			} else if (peek() == '~') {
249 				(void) input();
250 				yylval.i = NOTMATCH;
251 				RET(MATCHOP);
252 			} else
253 				RET(NOT);
254 		case '~':
255 			yylval.i = MATCH;
256 			RET(MATCHOP);
257 		case '<':
258 			if (peek() == '=') {
259 				(void) input();
260 				yylval.i = LE;
261 				RET(LE);
262 			} else {
263 				yylval.i = LT;
264 				RET(LT);
265 			}
266 		case '=':
267 			if (peek() == '=') {
268 				(void) input();
269 				yylval.i = EQ;
270 				RET(EQ);
271 			} else {
272 				yylval.i = ASSIGN;
273 				RET(ASGNOP);
274 			}
275 		case '>':
276 			if (peek() == '=') {
277 				(void) input();
278 				yylval.i = GE;
279 				RET(GE);
280 			} else if (peek() == '>') {
281 				(void) input();
282 				yylval.i = APPEND;
283 				RET(APPEND);
284 			} else {
285 				yylval.i = GT;
286 				RET(GT);
287 			}
288 		case '+':
289 			if (peek() == '+') {
290 				(void) input();
291 				yylval.i = INCR;
292 				RET(INCR);
293 			} else if (peek() == '=') {
294 				(void) input();
295 				yylval.i = ADDEQ;
296 				RET(ASGNOP);
297 			} else
298 				RET('+');
299 		case '-':
300 			if (peek() == '-') {
301 				(void) input();
302 				yylval.i = DECR;
303 				RET(DECR);
304 			} else if (peek() == '=') {
305 				(void) input();
306 				yylval.i = SUBEQ;
307 				RET(ASGNOP);
308 			} else
309 				RET('-');
310 		case '*':
311 			if (peek() == '=') {	/* *= */
312 				(void) input();
313 				yylval.i = MULTEQ;
314 				RET(ASGNOP);
315 			} else if (peek() == '*') {	/* ** or **= */
316 				(void) input();	/* eat 2nd * */
317 				if (peek() == '=') {
318 					(void) input();
319 					yylval.i = POWEQ;
320 					RET(ASGNOP);
321 				} else {
322 					RET(POWER);
323 				}
324 			} else
325 				RET('*');
326 		case '/':
327 			RET('/');
328 		case '%':
329 			if (peek() == '=') {
330 				(void) input();
331 				yylval.i = MODEQ;
332 				RET(ASGNOP);
333 			} else
334 				RET('%');
335 		case '^':
336 			if (peek() == '=') {
337 				(void) input();
338 				yylval.i = POWEQ;
339 				RET(ASGNOP);
340 			} else
341 				RET(POWER);
342 
343 		case '$':
344 			/* BUG: awkward, if not wrong */
345 			c = gettok(&buf, &bufsize);
346 			if (isalpha(c)) {
347 				if (strcmp(buf, "NF") == 0) {
348 					/* very special */
349 					unputstr("(NF)");
350 					RET(INDIRECT);
351 				}
352 				c = peek();
353 				if (c == '(' || c == '[' ||
354 				    (infunc && isarg(buf) >= 0)) {
355 					unputstr(buf);
356 					RET(INDIRECT);
357 				}
358 				yylval.cp = setsymtab(
359 				    buf, "", 0.0, STR|NUM, symtab);
360 				RET(IVAR);
361 			} else if (c == 0) {	/*  */
362 				SYNTAX("unexpected end of input after $");
363 				RET(';');
364 			} else {
365 				unputstr(buf);
366 				RET(INDIRECT);
367 			}
368 
369 		case '}':
370 			if (--bracecnt < 0)
371 				SYNTAX("extra }");
372 			sc = 1;
373 			RET(';');
374 		case ']':
375 			if (--brackcnt < 0)
376 				SYNTAX("extra ]");
377 			RET(']');
378 		case ')':
379 			if (--parencnt < 0)
380 				SYNTAX("extra )");
381 			RET(')');
382 		case '{':
383 			bracecnt++;
384 			RET('{');
385 		case '[':
386 			brackcnt++;
387 			RET('[');
388 		case '(':
389 			parencnt++;
390 			RET('(');
391 
392 		case '"':
393 			/* BUG: should be like tran.c ? */
394 			return (string());
395 
396 		default:
397 			RET(c);
398 		}
399 	}
400 }
401 
402 int
403 string(void)
404 {
405 	int c, n;
406 	char *s, *bp;
407 	static char *buf = NULL;
408 	static size_t bufsz = 500;
409 
410 	if (buf == NULL && (buf = (char *)malloc(bufsz)) == NULL)
411 		FATAL("out of space for strings");
412 	for (bp = buf; (c = input()) != '"'; ) {
413 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
414 			FATAL("out of space for string %.10s...", buf);
415 		switch (c) {
416 		case '\n':
417 		case '\r':
418 		case 0:
419 			*bp = '\0';
420 			SYNTAX("non-terminated string %.10s...", buf);
421 			if (c == 0)	/* hopeless */
422 				FATAL("giving up");
423 			lineno++;
424 			break;
425 		case '\\':
426 			c = input();
427 			switch (c) {
428 			case '"': *bp++ = '"'; break;
429 			case 'n': *bp++ = '\n'; break;
430 			case 't': *bp++ = '\t'; break;
431 			case 'f': *bp++ = '\f'; break;
432 			case 'r': *bp++ = '\r'; break;
433 			case 'b': *bp++ = '\b'; break;
434 			case 'v': *bp++ = '\v'; break;
435 			case 'a': *bp++ = '\007'; break;
436 			case '\\': *bp++ = '\\'; break;
437 
438 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
439 			case '3': case '4': case '5': case '6': case '7':
440 				n = c - '0';
441 				if ((c = peek()) >= '0' && c < '8') {
442 					n = 8 * n + input() - '0';
443 					if ((c = peek()) >= '0' && c < '8')
444 						n = 8 * n + input() - '0';
445 				}
446 				*bp++ = n;
447 				break;
448 
449 			case 'x': {	/* hex  \x0-9a-fA-F + */
450 				char xbuf[100], *px;
451 				px = xbuf;
452 				while ((c = input()) != 0 && px-xbuf < 100-2) {
453 					if (isdigit(c) ||
454 					    (c >= 'a' && c <= 'f') ||
455 					    (c >= 'A' && c <= 'F'))
456 						*px++ = c;
457 					else
458 						break;
459 				}
460 				*px = 0;
461 				unput(c);
462 				(void) sscanf(xbuf, "%x", (unsigned int *)&n);
463 				*bp++ = n;
464 				break;
465 			}
466 
467 			default:
468 				*bp++ = c;
469 				break;
470 			}
471 			break;
472 		default:
473 			*bp++ = c;
474 			break;
475 		}
476 	}
477 	*bp = 0;
478 	s = tostring(buf);
479 	*bp++ = ' '; *bp++ = 0;
480 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
481 	RET(STRING);
482 }
483 
484 
485 int
486 binsearch(char *w, Keyword *kp, int n)
487 {
488 	int cond, low, mid, high;
489 
490 	low = 0;
491 	high = n - 1;
492 	while (low <= high) {
493 		mid = (low + high) / 2;
494 		if ((cond = strcmp(w, kp[mid].word)) < 0)
495 			high = mid - 1;
496 		else if (cond > 0)
497 			low = mid + 1;
498 		else
499 			return (mid);
500 	}
501 	return (-1);
502 }
503 
504 int
505 word(char *w)
506 {
507 	Keyword *kp;
508 	int c, n;
509 
510 	n = binsearch(w, keywords, sizeof (keywords) / sizeof (keywords[0]));
511 	if (n != -1) {	/* found in table */
512 		kp = keywords + n;
513 		yylval.i = kp->sub;
514 		switch (kp->type) {	/* special handling */
515 		case BLTIN:
516 			if (kp->sub == FSYSTEM && safe)
517 				SYNTAX("system is unsafe");
518 			RET(kp->type);
519 		case FUNC:
520 			if (infunc)
521 				SYNTAX("illegal nested function");
522 			RET(kp->type);
523 		case RETURN:
524 			if (!infunc)
525 				SYNTAX("return not in function");
526 			RET(kp->type);
527 		case VARNF:
528 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
529 			RET(VARNF);
530 		default:
531 			RET(kp->type);
532 		}
533 	}
534 	c = peek();	/* look for '(' */
535 	if (c != '(' && infunc && (n = isarg(w)) >= 0) {
536 		yylval.i = n;
537 		RET(ARG);
538 	} else {
539 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
540 		if (c == '(') {
541 			RET(CALL);
542 		} else {
543 			RET(VAR);
544 		}
545 	}
546 }
547 
/*
 * Arm the one-shot `reg' flag.  The next yylex() call clears it and scans
 * the upcoming input with regexpr() instead of tokenizing it normally.
 */
548 void
549 startreg(void)	/* next call to yylex will return a regular expression */
550 {
551 	reg = 1;
552 }
553 
554 int
555 regexpr(void)
556 {
557 	int c;
558 	static char *buf = NULL;
559 	static size_t bufsz = 500;
560 	char *bp;
561 
562 	if (buf == NULL && (buf = (char *)malloc(bufsz)) == NULL)
563 		FATAL("out of space for rex expr");
564 	bp = buf;
565 	for (; (c = input()) != '/' && c != 0; ) {
566 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
567 			FATAL("out of space for reg expr %.10s...", buf);
568 		if (c == '\n') {
569 			*bp = '\0';
570 			SYNTAX("newline in regular expression %.10s...", buf);
571 			unput('\n');
572 			break;
573 		} else if (c == '\\') {
574 			*bp++ = '\\';
575 			*bp++ = input();
576 		} else {
577 			*bp++ = c;
578 		}
579 	}
580 	*bp = 0;
581 	if (c == 0)
582 		SYNTAX("non-terminated regular expression %.10s...", buf);
583 	yylval.s = tostring(buf);
584 	unput('/');
585 	RET(REGEXPR);
586 }
587 
588 /* low-level lexical stuff, sort of inherited from lex */
589 
/*
 * Character-level input state shared by input(), unput() and unputstr().
 *
 * ebuf is a wrap-around record of the characters input() has returned and
 * ep is the position of the next one; unput() steps ep back.  (Presumably
 * this is used elsewhere to show context in error messages -- confirm.)
 * yysbuf is a stack of pushed-back characters and yysptr points one past
 * its top, so the stack is empty when yysptr == yysbuf.  yyin is not
 * referenced by the code in this part of the file.
 */
590 char	ebuf[300];
591 char	*ep = ebuf;
592 char	yysbuf[100];	/* pushback buffer */
593 char	*yysptr = yysbuf;
594 FILE	*yyin = NULL;
595 
596 int
597 input(void)	/* get next lexical input character */
598 {
599 	int c;
600 	extern char *lexprog;
601 
602 	if (yysptr > yysbuf)
603 		c = (uschar)*--yysptr;
604 	else if (lexprog != NULL) {	/* awk '...' */
605 		if ((c = (uschar)*lexprog) != 0)
606 			lexprog++;
607 	} else				/* awk -f ... */
608 		c = pgetc();
609 	if (c == EOF)
610 		c = 0;
611 	if (ep >= ebuf + sizeof (ebuf))
612 		ep = ebuf;
613 	*ep = c;
614 	if (c != 0) {
615 		ep++;
616 	}
617 	return (c);
618 }
619 
620 void
621 unput(int c)	/* put lexical character back on input */
622 {
623 	if (yysptr >= yysbuf + sizeof (yysbuf))
624 		FATAL("pushed back too much: %.20s...", yysbuf);
625 	*yysptr++ = c;
626 	if (--ep < ebuf)
627 		ep = ebuf + sizeof (ebuf) - 1;
628 }
629 
/*
 * Push the whole string s back onto the input.  The characters are handed
 * to unput() last-first so that subsequent input() calls return them in
 * their original order.  An empty string pushes nothing.
 */
void
unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
638