xref: /freebsd/contrib/one-true-awk/lex.c (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31 
extern YYSTYPE	yylval;	/* semantic value of the token being returned; defined outside this file */
extern bool	infunc;	/* defined outside this file; word() consults it for function/return checks */

/* Lexer bookkeeping.  None of these are static, so other files may read them. */
int	lineno	= 1;	/* current line: yylex() bumps it on '\n', unput() lowers it again */
int	bracecnt = 0;	/* '{' tokens seen minus '}' tokens seen */
int	brackcnt  = 0;	/* '[' tokens seen minus ']' tokens seen */
int	parencnt = 0;	/* '(' tokens seen minus ')' tokens seen */
39 
/* One row of the reserved-word table consulted by word(). */
typedef struct Keyword {
	const char *word;	/* spelling compared against the identifier */
	int	sub;	/* value word() stores in yylval.i */
	int	type;	/* token code word() returns */
} Keyword;
45 
/*
 * Reserved words and built-in function names.  word() looks entries up with
 * binsearch(), so the rows must stay in strcmp() order (upper case sorts
 * before lower case).  Columns: spelling, value stored in yylval.i, token
 * code returned to the parser.
 */
const Keyword keywords[] = {	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "gensub",	GENSUB,		GENSUB },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "mktime",	FMKTIME,	BLTIN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "strftime",	FSTRFTIME,	BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "systime",	FSYSTIME,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
101 
/*
 * Return token x from the enclosing function; when dbg is set, first print
 * its name as reported by tokname().  The expansion is a bare { } block and
 * mentions x twice, so pass arguments without side effects and use it only
 * where a block followed by ';' is syntactically valid.
 */
#define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }
103 
/* Report the next input character without consuming it. */
static int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back so the following input() sees it again */
	return next;
}
110 
111 static int gettok(char **pbuf, int *psz)	/* get next input token */
112 {
113 	int c, retc;
114 	char *buf = *pbuf;
115 	int sz = *psz;
116 	char *bp = buf;
117 
118 	c = input();
119 	if (c == 0)
120 		return 0;
121 	buf[0] = c;
122 	buf[1] = 0;
123 	if (!isalnum(c) && c != '.' && c != '_')
124 		return c;
125 
126 	*bp++ = c;
127 	if (isalpha(c) || c == '_') {	/* it's a varname */
128 		for ( ; (c = input()) != 0; ) {
129 			if (bp-buf >= sz)
130 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
131 					FATAL( "out of space for name %.10s...", buf );
132 			if (isalnum(c) || c == '_')
133 				*bp++ = c;
134 			else {
135 				*bp = 0;
136 				unput(c);
137 				break;
138 			}
139 		}
140 		*bp = 0;
141 		retc = 'a';	/* alphanumeric */
142 	} else {	/* maybe it's a number, but could be . */
143 		char *rem;
144 		/* read input until can't be a number */
145 		for ( ; (c = input()) != 0; ) {
146 			if (bp-buf >= sz)
147 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
148 					FATAL( "out of space for number %.10s...", buf );
149 			if (isdigit(c) || c == 'e' || c == 'E'
150 			  || c == '.' || c == '+' || c == '-')
151 				*bp++ = c;
152 			else {
153 				unput(c);
154 				break;
155 			}
156 		}
157 		*bp = 0;
158 		strtod(buf, &rem);	/* parse the number */
159 		if (rem == buf) {	/* it wasn't a valid number at all */
160 			buf[1] = 0;	/* return one character as token */
161 			retc = (uschar)buf[0];	/* character is its own type */
162 			unputstr(rem+1); /* put rest back for later */
163 		} else {	/* some prefix was a number */
164 			unputstr(rem);	/* put rest back for later */
165 			rem[0] = 0;	/* truncate buf after number part */
166 			retc = '0';	/* type is number */
167 		}
168 	}
169 	*pbuf = buf;
170 	*psz = sz;
171 	return retc;
172 }
173 
/* Forward declarations for helpers defined further down and called from yylex(). */
int	word(char *);
int	string(void);
int	regexpr(void);
/* One-shot flags examined at the top of yylex() and cleared when acted on. */
bool	sc	= false;	/* true => return a } right now */
bool	reg	= false;	/* true => return a REGEXPR now */
179 
180 int yylex(void)
181 {
182 	int c;
183 	static char *buf = NULL;
184 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
185 
186 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
187 		FATAL( "out of space in yylex" );
188 	if (sc) {
189 		sc = false;
190 		RET('}');
191 	}
192 	if (reg) {
193 		reg = false;
194 		return regexpr();
195 	}
196 	for (;;) {
197 		c = gettok(&buf, &bufsize);
198 		if (c == 0)
199 			return 0;
200 		if (isalpha(c) || c == '_')
201 			return word(buf);
202 		if (isdigit(c)) {
203 			char *cp = tostring(buf);
204 			double result;
205 
206 			if (is_number(cp, & result))
207 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
208 			else
209 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
210 			free(cp);
211 			/* should this also have STR set? */
212 			RET(NUMBER);
213 		}
214 
215 		yylval.i = c;
216 		switch (c) {
217 		case '\n':	/* {EOL} */
218 			lineno++;
219 			RET(NL);
220 		case '\r':	/* assume \n is coming */
221 		case ' ':	/* {WS}+ */
222 		case '\t':
223 			break;
224 		case '#':	/* #.* strip comments */
225 			while ((c = input()) != '\n' && c != 0)
226 				;
227 			unput(c);
228 			/*
229 			 * Next line is a hack, it compensates for
230 			 * unput's treatment of \n.
231 			 */
232 			lineno++;
233 			break;
234 		case ';':
235 			RET(';');
236 		case '\\':
237 			if (peek() == '\n') {
238 				input();
239 				lineno++;
240 			} else if (peek() == '\r') {
241 				input(); input();	/* \n */
242 				lineno++;
243 			} else {
244 				RET(c);
245 			}
246 			break;
247 		case '&':
248 			if (peek() == '&') {
249 				input(); RET(AND);
250 			} else
251 				RET('&');
252 		case '|':
253 			if (peek() == '|') {
254 				input(); RET(BOR);
255 			} else
256 				RET('|');
257 		case '!':
258 			if (peek() == '=') {
259 				input(); yylval.i = NE; RET(NE);
260 			} else if (peek() == '~') {
261 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
262 			} else
263 				RET(NOT);
264 		case '~':
265 			yylval.i = MATCH;
266 			RET(MATCHOP);
267 		case '<':
268 			if (peek() == '=') {
269 				input(); yylval.i = LE; RET(LE);
270 			} else {
271 				yylval.i = LT; RET(LT);
272 			}
273 		case '=':
274 			if (peek() == '=') {
275 				input(); yylval.i = EQ; RET(EQ);
276 			} else {
277 				yylval.i = ASSIGN; RET(ASGNOP);
278 			}
279 		case '>':
280 			if (peek() == '=') {
281 				input(); yylval.i = GE; RET(GE);
282 			} else if (peek() == '>') {
283 				input(); yylval.i = APPEND; RET(APPEND);
284 			} else {
285 				yylval.i = GT; RET(GT);
286 			}
287 		case '+':
288 			if (peek() == '+') {
289 				input(); yylval.i = INCR; RET(INCR);
290 			} else if (peek() == '=') {
291 				input(); yylval.i = ADDEQ; RET(ASGNOP);
292 			} else
293 				RET('+');
294 		case '-':
295 			if (peek() == '-') {
296 				input(); yylval.i = DECR; RET(DECR);
297 			} else if (peek() == '=') {
298 				input(); yylval.i = SUBEQ; RET(ASGNOP);
299 			} else
300 				RET('-');
301 		case '*':
302 			if (peek() == '=') {	/* *= */
303 				input(); yylval.i = MULTEQ; RET(ASGNOP);
304 			} else if (peek() == '*') {	/* ** or **= */
305 				input();	/* eat 2nd * */
306 				if (peek() == '=') {
307 					input(); yylval.i = POWEQ; RET(ASGNOP);
308 				} else {
309 					RET(POWER);
310 				}
311 			} else
312 				RET('*');
313 		case '/':
314 			RET('/');
315 		case '%':
316 			if (peek() == '=') {
317 				input(); yylval.i = MODEQ; RET(ASGNOP);
318 			} else
319 				RET('%');
320 		case '^':
321 			if (peek() == '=') {
322 				input(); yylval.i = POWEQ; RET(ASGNOP);
323 			} else
324 				RET(POWER);
325 
326 		case '$':
327 			/* BUG: awkward, if not wrong */
328 			c = gettok(&buf, &bufsize);
329 			if (isalpha(c)) {
330 				if (strcmp(buf, "NF") == 0) {	/* very special */
331 					unputstr("(NF)");
332 					RET(INDIRECT);
333 				}
334 				c = peek();
335 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
336 					unputstr(buf);
337 					RET(INDIRECT);
338 				}
339 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
340 				RET(IVAR);
341 			} else if (c == 0) {	/*  */
342 				SYNTAX( "unexpected end of input after $" );
343 				RET(';');
344 			} else {
345 				unputstr(buf);
346 				RET(INDIRECT);
347 			}
348 
349 		case '}':
350 			if (--bracecnt < 0)
351 				SYNTAX( "extra }" );
352 			sc = true;
353 			RET(';');
354 		case ']':
355 			if (--brackcnt < 0)
356 				SYNTAX( "extra ]" );
357 			RET(']');
358 		case ')':
359 			if (--parencnt < 0)
360 				SYNTAX( "extra )" );
361 			RET(')');
362 		case '{':
363 			bracecnt++;
364 			RET('{');
365 		case '[':
366 			brackcnt++;
367 			RET('[');
368 		case '(':
369 			parencnt++;
370 			RET('(');
371 
372 		case '"':
373 			return string();	/* BUG: should be like tran.c ? */
374 
375 		default:
376 			RET(c);
377 		}
378 	}
379 }
380 
381 extern int runetochar(char *str, int c);
382 
383 int string(void)
384 {
385 	int c, n;
386 	char *s, *bp;
387 	static char *buf = NULL;
388 	static int bufsz = 500;
389 
390 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
391 		FATAL("out of space for strings");
392 	for (bp = buf; (c = input()) != '"'; ) {
393 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
394 			FATAL("out of space for string %.10s...", buf);
395 		switch (c) {
396 		case '\n':
397 		case '\r':
398 		case 0:
399 			*bp = '\0';
400 			SYNTAX( "non-terminated string %.10s...", buf );
401 			if (c == 0)	/* hopeless */
402 				FATAL( "giving up" );
403 			lineno++;
404 			break;
405 		case '\\':
406 			c = input();
407 			switch (c) {
408 			case '\n': break;
409 			case '"': *bp++ = '"'; break;
410 			case 'n': *bp++ = '\n'; break;
411 			case 't': *bp++ = '\t'; break;
412 			case 'f': *bp++ = '\f'; break;
413 			case 'r': *bp++ = '\r'; break;
414 			case 'b': *bp++ = '\b'; break;
415 			case 'v': *bp++ = '\v'; break;
416 			case 'a': *bp++ = '\a'; break;
417 			case '\\': *bp++ = '\\'; break;
418 
419 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
420 			case '3': case '4': case '5': case '6': case '7':
421 				n = c - '0';
422 				if ((c = peek()) >= '0' && c < '8') {
423 					n = 8 * n + input() - '0';
424 					if ((c = peek()) >= '0' && c < '8')
425 						n = 8 * n + input() - '0';
426 				}
427 				*bp++ = n;
428 				break;
429 
430 			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
431 			    {
432 				int i;
433 
434 				if (!isxdigit(peek())) {
435 					unput(c);
436 					break;
437 				}
438 				n = 0;
439 				for (i = 0; i < 2; i++) {
440 					c = input();
441 					if (c == 0)
442 						break;
443 					if (isxdigit(c)) {
444 						c = tolower(c);
445 						n *= 16;
446 						if (isdigit(c))
447 							n += (c - '0');
448 						else
449 							n += 10 + (c - 'a');
450 					} else {
451 						unput(c);
452 						break;
453 					}
454 				}
455 				if (i)
456 					*bp++ = n;
457 				break;
458 			    }
459 
460 			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
461 			    {
462 				int i;
463 
464 				n = 0;
465 				for (i = 0; i < 8; i++) {
466 					c = input();
467 					if (!isxdigit(c) || c == 0)
468 						break;
469 					c = tolower(c);
470 					n *= 16;
471 					if (isdigit(c))
472 						n += (c - '0');
473 					else
474 						n += 10 + (c - 'a');
475 				}
476 				unput(c);
477 				bp += runetochar(bp, n);
478 				break;
479 			    }
480 
481 			default:
482 				*bp++ = c;
483 				break;
484 			}
485 			break;
486 		default:
487 			*bp++ = c;
488 			break;
489 		}
490 	}
491 	*bp = 0;
492 	s = tostring(buf);
493 	*bp++ = ' '; *bp++ = '\0';
494 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
495 	free(s);
496 	RET(STRING);
497 }
498 
499 
500 static int binsearch(char *w, const Keyword *kp, int n)
501 {
502 	int cond, low, mid, high;
503 
504 	low = 0;
505 	high = n - 1;
506 	while (low <= high) {
507 		mid = (low + high) / 2;
508 		if ((cond = strcmp(w, kp[mid].word)) < 0)
509 			high = mid - 1;
510 		else if (cond > 0)
511 			low = mid + 1;
512 		else
513 			return mid;
514 	}
515 	return -1;
516 }
517 
518 int word(char *w)
519 {
520 	const Keyword *kp;
521 	int c, n;
522 
523 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
524 	if (n != -1) {	/* found in table */
525 		kp = keywords + n;
526 		yylval.i = kp->sub;
527 		switch (kp->type) {	/* special handling */
528 		case BLTIN:
529 			if (kp->sub == FSYSTEM && safe)
530 				SYNTAX( "system is unsafe" );
531 			RET(kp->type);
532 		case FUNC:
533 			if (infunc)
534 				SYNTAX( "illegal nested function" );
535 			RET(kp->type);
536 		case RETURN:
537 			if (!infunc)
538 				SYNTAX( "return not in function" );
539 			RET(kp->type);
540 		case VARNF:
541 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
542 			RET(VARNF);
543 		default:
544 			RET(kp->type);
545 		}
546 	}
547 	c = peek();	/* look for '(' */
548 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
549 		yylval.i = n;
550 		RET(ARG);
551 	} else {
552 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
553 		if (c == '(') {
554 			RET(CALL);
555 		} else {
556 			RET(VAR);
557 		}
558 	}
559 }
560 
561 void startreg(void)	/* next call to yylex will return a regular expression */
562 {
563 	reg = true;
564 }
565 
566 int regexpr(void)
567 {
568 	int c;
569 	static char *buf = NULL;
570 	static int bufsz = 500;
571 	char *bp;
572 
573 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
574 		FATAL("out of space for reg expr");
575 	bp = buf;
576 	for ( ; (c = input()) != '/' && c != 0; ) {
577 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
578 			FATAL("out of space for reg expr %.10s...", buf);
579 		if (c == '\n') {
580 			*bp = '\0';
581 			SYNTAX( "newline in regular expression %.10s...", buf );
582 			unput('\n');
583 			break;
584 		} else if (c == '\\') {
585 			*bp++ = '\\';
586 			*bp++ = input();
587 		} else {
588 			*bp++ = c;
589 		}
590 	}
591 	*bp = 0;
592 	if (c == 0)
593 		SYNTAX("non-terminated regular expression %.10s...", buf);
594 	yylval.s = tostring(buf);
595 	unput('/');
596 	RET(REGEXPR);
597 }
598 
599 /* low-level lexical stuff, sort of inherited from lex */
600 
char	ebuf[300];	/* record of recently read characters; input() writes it and wraps at the end */
char	*ep = ebuf;	/* next write position in ebuf; unput() steps it back */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;	/* not referenced by the code visible in this file */
606 
607 int input(void)	/* get next lexical input character */
608 {
609 	int c;
610 	extern char *lexprog;
611 
612 	if (yysptr > yysbuf)
613 		c = (uschar)*--yysptr;
614 	else if (lexprog != NULL) {	/* awk '...' */
615 		if ((c = (uschar)*lexprog) != 0)
616 			lexprog++;
617 	} else				/* awk -f ... */
618 		c = pgetc();
619 	if (c == EOF)
620 		c = 0;
621 	if (ep >= ebuf + sizeof ebuf)
622 		ep = ebuf;
623 	*ep = c;
624 	if (c != 0) {
625 		ep++;
626 	}
627 	return (c);
628 }
629 
630 void unput(int c)	/* put lexical character back on input */
631 {
632 	if (c == '\n')
633 		lineno--;
634 	if (yysptr >= yysbuf + sizeof(yysbuf))
635 		FATAL("pushed back too much: %.20s...", yysbuf);
636 	*yysptr++ = c;
637 	if (--ep < ebuf)
638 		ep = ebuf + sizeof(ebuf) - 1;
639 }
640 
/*
 * unputstr: push the whole string s back onto the input.  Characters are
 * pushed last-to-first so that input() returns them in their original
 * order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	size_t i;

	/* count down in size_t: no unsigned wrap for "" and no narrowing to int */
	for (i = strlen(s); i > 0; i--)
		unput(s[i-1]);
}
648