xref: /freebsd/contrib/one-true-awk/lex.c (revision dd78d987cb38ef162d40aad86229f1dc19884f78)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31 
32 extern YYSTYPE	yylval;
33 extern bool	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
40 typedef struct Keyword {
41 	const char *word;
42 	int	sub;
43 	int	type;
44 } Keyword;
45 
46 const Keyword keywords[] = {	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "and",	FAND,		BLTIN },
51 	{ "atan2",	FATAN,		BLTIN },
52 	{ "break",	BREAK,		BREAK },
53 	{ "close",	CLOSE,		CLOSE },
54 	{ "compl",	FCOMPL,		BLTIN },
55 	{ "continue",	CONTINUE,	CONTINUE },
56 	{ "cos",	FCOS,		BLTIN },
57 	{ "delete",	DELETE,		DELETE },
58 	{ "do",		DO,		DO },
59 	{ "else",	ELSE,		ELSE },
60 	{ "exit",	EXIT,		EXIT },
61 	{ "exp",	FEXP,		BLTIN },
62 	{ "fflush",	FFLUSH,		BLTIN },
63 	{ "for",	FOR,		FOR },
64 	{ "func",	FUNC,		FUNC },
65 	{ "function",	FUNC,		FUNC },
66 	{ "gensub",	GENSUB,		GENSUB },
67 	{ "getline",	GETLINE,	GETLINE },
68 	{ "gsub",	GSUB,		GSUB },
69 	{ "if",		IF,		IF },
70 	{ "in",		IN,		IN },
71 	{ "index",	INDEX,		INDEX },
72 	{ "int",	FINT,		BLTIN },
73 	{ "length",	FLENGTH,	BLTIN },
74 	{ "log",	FLOG,		BLTIN },
75 	{ "lshift",	FLSHIFT,	BLTIN },
76 	{ "match",	MATCHFCN,	MATCHFCN },
77 	{ "mktime",	FMKTIME,	BLTIN },
78 	{ "next",	NEXT,		NEXT },
79 	{ "nextfile",	NEXTFILE,	NEXTFILE },
80 	{ "or",		FFOR,		BLTIN },
81 	{ "print",	PRINT,		PRINT },
82 	{ "printf",	PRINTF,		PRINTF },
83 	{ "rand",	FRAND,		BLTIN },
84 	{ "return",	RETURN,		RETURN },
85 	{ "rshift",	FRSHIFT,	BLTIN },
86 	{ "sin",	FSIN,		BLTIN },
87 	{ "split",	SPLIT,		SPLIT },
88 	{ "sprintf",	SPRINTF,	SPRINTF },
89 	{ "sqrt",	FSQRT,		BLTIN },
90 	{ "srand",	FSRAND,		BLTIN },
91 	{ "strftime",	FSTRFTIME,	BLTIN },
92 	{ "sub",	SUB,		SUB },
93 	{ "substr",	SUBSTR,		SUBSTR },
94 	{ "system",	FSYSTEM,	BLTIN },
95 	{ "systime",	FSYSTIME,	BLTIN },
96 	{ "tolower",	FTOLOWER,	BLTIN },
97 	{ "toupper",	FTOUPPER,	BLTIN },
98 	{ "while",	WHILE,		WHILE },
99 	{ "xor",	FXOR,		BLTIN },
100 };
101 
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name (via tokname()) when the dbg flag is set.
 *
 * The do { } while (0) wrapper makes the expansion a single statement, so
 * "if (cond) RET(a); else RET(b);" parses as intended.  x is evaluated
 * twice when dbg is set, so pass only side-effect-free expressions.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
103 
/*
 * peek: report the next input character without consuming it.
 * Implemented as input() immediately followed by unput() of the same value,
 * so the character (0 at end of input) is returned again by the next input().
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
110 
gettok(char ** pbuf,int * psz)111 static int gettok(char **pbuf, int *psz)	/* get next input token */
112 {
113 	int c, retc;
114 	char *buf = *pbuf;
115 	int sz = *psz;
116 	char *bp = buf;
117 
118 	c = input();
119 	if (c == 0)
120 		return 0;
121 	buf[0] = c;
122 	buf[1] = 0;
123 	if (!isalnum(c) && c != '.' && c != '_')
124 		return c;
125 
126 	*bp++ = c;
127 	if (isalpha(c) || c == '_') {	/* it's a varname */
128 		for ( ; (c = input()) != 0; ) {
129 			if (bp-buf >= sz)
130 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
131 					FATAL( "out of space for name %.10s...", buf );
132 			if (isalnum(c) || c == '_')
133 				*bp++ = c;
134 			else {
135 				*bp = 0;
136 				unput(c);
137 				break;
138 			}
139 		}
140 		*bp = 0;
141 		retc = 'a';	/* alphanumeric */
142 	} else {	/* maybe it's a number, but could be . */
143 		char *rem;
144 		/* read input until can't be a number */
145 		for ( ; (c = input()) != 0; ) {
146 			if (bp-buf >= sz)
147 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
148 					FATAL( "out of space for number %.10s...", buf );
149 			if (isdigit(c) || c == 'e' || c == 'E'
150 			  || c == '.' || c == '+' || c == '-')
151 				*bp++ = c;
152 			else {
153 				unput(c);
154 				break;
155 			}
156 		}
157 		*bp = 0;
158 		strtod(buf, &rem);	/* parse the number */
159 		if (rem == buf) {	/* it wasn't a valid number at all */
160 			buf[1] = 0;	/* return one character as token */
161 			retc = (uschar)buf[0];	/* character is its own type */
162 			unputstr(rem+1); /* put rest back for later */
163 		} else {	/* some prefix was a number */
164 			unputstr(rem);	/* put rest back for later */
165 			rem[0] = 0;	/* truncate buf after number part */
166 			retc = '0';	/* type is number */
167 		}
168 	}
169 	*pbuf = buf;
170 	*psz = sz;
171 	return retc;
172 }
173 
/* Token-level helpers defined further down in this file. */
int	word(char *);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
179 
yylex(void)180 int yylex(void)
181 {
182 	int c;
183 	static char *buf = NULL;
184 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
185 
186 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
187 		FATAL( "out of space in yylex" );
188 	if (sc) {
189 		sc = false;
190 		RET('}');
191 	}
192 	if (reg) {
193 		reg = false;
194 		return regexpr();
195 	}
196 	for (;;) {
197 		c = gettok(&buf, &bufsize);
198 		if (c == 0)
199 			return 0;
200 		if (isalpha(c) || c == '_')
201 			return word(buf);
202 		if (isdigit(c)) {
203 			char *cp = tostring(buf);
204 			double result;
205 
206 			if (is_number(cp, & result))
207 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
208 			else
209 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
210 			free(cp);
211 			/* should this also have STR set? */
212 			RET(NUMBER);
213 		}
214 
215 		yylval.i = c;
216 		switch (c) {
217 		case '\n':	/* {EOL} */
218 			lineno++;
219 			RET(NL);
220 		case '\r':	/* assume \n is coming */
221 		case ' ':	/* {WS}+ */
222 		case '\t':
223 			break;
224 		case '#':	/* #.* strip comments */
225 			while ((c = input()) != '\n' && c != 0)
226 				;
227 			unput(c);
228 			break;
229 		case ';':
230 			RET(';');
231 		case '\\':
232 			if (peek() == '\n') {
233 				input();
234 				lineno++;
235 			} else if (peek() == '\r') {
236 				input(); input();	/* \n */
237 				lineno++;
238 			} else {
239 				RET(c);
240 			}
241 			break;
242 		case '&':
243 			if (peek() == '&') {
244 				input(); RET(AND);
245 			} else
246 				RET('&');
247 		case '|':
248 			if (peek() == '|') {
249 				input(); RET(BOR);
250 			} else
251 				RET('|');
252 		case '!':
253 			if (peek() == '=') {
254 				input(); yylval.i = NE; RET(NE);
255 			} else if (peek() == '~') {
256 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
257 			} else
258 				RET(NOT);
259 		case '~':
260 			yylval.i = MATCH;
261 			RET(MATCHOP);
262 		case '<':
263 			if (peek() == '=') {
264 				input(); yylval.i = LE; RET(LE);
265 			} else {
266 				yylval.i = LT; RET(LT);
267 			}
268 		case '=':
269 			if (peek() == '=') {
270 				input(); yylval.i = EQ; RET(EQ);
271 			} else {
272 				yylval.i = ASSIGN; RET(ASGNOP);
273 			}
274 		case '>':
275 			if (peek() == '=') {
276 				input(); yylval.i = GE; RET(GE);
277 			} else if (peek() == '>') {
278 				input(); yylval.i = APPEND; RET(APPEND);
279 			} else {
280 				yylval.i = GT; RET(GT);
281 			}
282 		case '+':
283 			if (peek() == '+') {
284 				input(); yylval.i = INCR; RET(INCR);
285 			} else if (peek() == '=') {
286 				input(); yylval.i = ADDEQ; RET(ASGNOP);
287 			} else
288 				RET('+');
289 		case '-':
290 			if (peek() == '-') {
291 				input(); yylval.i = DECR; RET(DECR);
292 			} else if (peek() == '=') {
293 				input(); yylval.i = SUBEQ; RET(ASGNOP);
294 			} else
295 				RET('-');
296 		case '*':
297 			if (peek() == '=') {	/* *= */
298 				input(); yylval.i = MULTEQ; RET(ASGNOP);
299 			} else if (peek() == '*') {	/* ** or **= */
300 				input();	/* eat 2nd * */
301 				if (peek() == '=') {
302 					input(); yylval.i = POWEQ; RET(ASGNOP);
303 				} else {
304 					RET(POWER);
305 				}
306 			} else
307 				RET('*');
308 		case '/':
309 			RET('/');
310 		case '%':
311 			if (peek() == '=') {
312 				input(); yylval.i = MODEQ; RET(ASGNOP);
313 			} else
314 				RET('%');
315 		case '^':
316 			if (peek() == '=') {
317 				input(); yylval.i = POWEQ; RET(ASGNOP);
318 			} else
319 				RET(POWER);
320 
321 		case '$':
322 			/* BUG: awkward, if not wrong */
323 			c = gettok(&buf, &bufsize);
324 			if (isalpha(c)) {
325 				if (strcmp(buf, "NF") == 0) {	/* very special */
326 					unputstr("(NF)");
327 					RET(INDIRECT);
328 				}
329 				c = peek();
330 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
331 					unputstr(buf);
332 					RET(INDIRECT);
333 				}
334 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
335 				RET(IVAR);
336 			} else if (c == 0) {	/*  */
337 				SYNTAX( "unexpected end of input after $" );
338 				RET(';');
339 			} else {
340 				unputstr(buf);
341 				RET(INDIRECT);
342 			}
343 
344 		case '}':
345 			if (--bracecnt < 0)
346 				SYNTAX( "extra }" );
347 			sc = true;
348 			RET(';');
349 		case ']':
350 			if (--brackcnt < 0)
351 				SYNTAX( "extra ]" );
352 			RET(']');
353 		case ')':
354 			if (--parencnt < 0)
355 				SYNTAX( "extra )" );
356 			RET(')');
357 		case '{':
358 			bracecnt++;
359 			RET('{');
360 		case '[':
361 			brackcnt++;
362 			RET('[');
363 		case '(':
364 			parencnt++;
365 			RET('(');
366 
367 		case '"':
368 			return string();	/* BUG: should be like tran.c ? */
369 
370 		default:
371 			RET(c);
372 		}
373 	}
374 }
375 
376 extern int runetochar(char *str, int c);
377 
string(void)378 int string(void)
379 {
380 	int c, n;
381 	char *s, *bp;
382 	static char *buf = NULL;
383 	static int bufsz = 500;
384 
385 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
386 		FATAL("out of space for strings");
387 	for (bp = buf; (c = input()) != '"'; ) {
388 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
389 			FATAL("out of space for string %.10s...", buf);
390 		switch (c) {
391 		case '\n':
392 		case '\r':
393 		case 0:
394 			*bp = '\0';
395 			SYNTAX( "non-terminated string %.10s...", buf );
396 			if (c == 0)	/* hopeless */
397 				FATAL( "giving up" );
398 			lineno++;
399 			break;
400 		case '\\':
401 			c = input();
402 			switch (c) {
403 			case '\n': break;
404 			case '"': *bp++ = '"'; break;
405 			case 'n': *bp++ = '\n'; break;
406 			case 't': *bp++ = '\t'; break;
407 			case 'f': *bp++ = '\f'; break;
408 			case 'r': *bp++ = '\r'; break;
409 			case 'b': *bp++ = '\b'; break;
410 			case 'v': *bp++ = '\v'; break;
411 			case 'a': *bp++ = '\a'; break;
412 			case '\\': *bp++ = '\\'; break;
413 
414 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
415 			case '3': case '4': case '5': case '6': case '7':
416 				n = c - '0';
417 				if ((c = peek()) >= '0' && c < '8') {
418 					n = 8 * n + input() - '0';
419 					if ((c = peek()) >= '0' && c < '8')
420 						n = 8 * n + input() - '0';
421 				}
422 				*bp++ = n;
423 				break;
424 
425 			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
426 			    {
427 				int i;
428 
429 				if (!isxdigit(peek())) {
430 					unput(c);
431 					break;
432 				}
433 				n = 0;
434 				for (i = 0; i < 2; i++) {
435 					c = input();
436 					if (c == 0)
437 						break;
438 					if (isxdigit(c)) {
439 						c = tolower(c);
440 						n *= 16;
441 						if (isdigit(c))
442 							n += (c - '0');
443 						else
444 							n += 10 + (c - 'a');
445 					} else {
446 						unput(c);
447 						break;
448 					}
449 				}
450 				if (i)
451 					*bp++ = n;
452 				break;
453 			    }
454 
455 			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
456 			    {
457 				int i;
458 
459 				n = 0;
460 				for (i = 0; i < 8; i++) {
461 					c = input();
462 					if (!isxdigit(c) || c == 0)
463 						break;
464 					c = tolower(c);
465 					n *= 16;
466 					if (isdigit(c))
467 						n += (c - '0');
468 					else
469 						n += 10 + (c - 'a');
470 				}
471 				unput(c);
472 				bp += runetochar(bp, n);
473 				break;
474 			    }
475 
476 			default:
477 				*bp++ = c;
478 				break;
479 			}
480 			break;
481 		default:
482 			*bp++ = c;
483 			break;
484 		}
485 	}
486 	*bp = 0;
487 	s = tostring(buf);
488 	*bp++ = ' '; *bp++ = '\0';
489 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
490 	free(s);
491 	RET(STRING);
492 }
493 
494 
binsearch(char * w,const Keyword * kp,int n)495 static int binsearch(char *w, const Keyword *kp, int n)
496 {
497 	int cond, low, mid, high;
498 
499 	low = 0;
500 	high = n - 1;
501 	while (low <= high) {
502 		mid = (low + high) / 2;
503 		if ((cond = strcmp(w, kp[mid].word)) < 0)
504 			high = mid - 1;
505 		else if (cond > 0)
506 			low = mid + 1;
507 		else
508 			return mid;
509 	}
510 	return -1;
511 }
512 
word(char * w)513 int word(char *w)
514 {
515 	const Keyword *kp;
516 	int c, n;
517 
518 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
519 	if (n != -1) {	/* found in table */
520 		kp = keywords + n;
521 		yylval.i = kp->sub;
522 		switch (kp->type) {	/* special handling */
523 		case BLTIN:
524 			if (kp->sub == FSYSTEM && safe)
525 				SYNTAX( "system is unsafe" );
526 			RET(kp->type);
527 		case FUNC:
528 			if (infunc)
529 				SYNTAX( "illegal nested function" );
530 			RET(kp->type);
531 		case RETURN:
532 			if (!infunc)
533 				SYNTAX( "return not in function" );
534 			RET(kp->type);
535 		case VARNF:
536 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
537 			RET(VARNF);
538 		default:
539 			RET(kp->type);
540 		}
541 	}
542 	c = peek();	/* look for '(' */
543 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
544 		yylval.i = n;
545 		RET(ARG);
546 	} else {
547 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
548 		if (c == '(') {
549 			RET(CALL);
550 		} else {
551 			RET(VAR);
552 		}
553 	}
554 }
555 
startreg(void)556 void startreg(void)	/* next call to yylex will return a regular expression */
557 {
558 	reg = true;
559 }
560 
regexpr(void)561 int regexpr(void)
562 {
563 	int c;
564 	static char *buf = NULL;
565 	static int bufsz = 500;
566 	char *bp;
567 
568 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
569 		FATAL("out of space for reg expr");
570 	bp = buf;
571 	for ( ; (c = input()) != '/' && c != 0; ) {
572 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
573 			FATAL("out of space for reg expr %.10s...", buf);
574 		if (c == '\n') {
575 			*bp = '\0';
576 			SYNTAX( "newline in regular expression %.10s...", buf );
577 			unput('\n');
578 			break;
579 		} else if (c == '\\') {
580 			*bp++ = '\\';
581 			*bp++ = input();
582 		} else {
583 			*bp++ = c;
584 		}
585 	}
586 	*bp = 0;
587 	if (c == 0)
588 		SYNTAX("non-terminated regular expression %.10s...", buf);
589 	yylval.s = tostring(buf);
590 	unput('/');
591 	RET(REGEXPR);
592 }
593 
594 /* low-level lexical stuff, sort of inherited from lex */
595 
/* Record of recently read input: input() stores each character at ep. */
char	ebuf[300];
char	*ep = ebuf;		/* next slot in ebuf; wraps back to the start */

/* Pushback stack: unput() pushes, input() pops before reading new input. */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed character */

FILE	*yyin = NULL;		/* not referenced by the code in this file */
601 
input(void)602 int input(void)	/* get next lexical input character */
603 {
604 	int c;
605 	extern char *lexprog;
606 
607 	if (yysptr > yysbuf)
608 		c = (uschar)*--yysptr;
609 	else if (lexprog != NULL) {	/* awk '...' */
610 		if ((c = (uschar)*lexprog) != 0)
611 			lexprog++;
612 	} else				/* awk -f ... */
613 		c = pgetc();
614 	if (c == EOF)
615 		c = 0;
616 	if (ep >= ebuf + sizeof ebuf)
617 		ep = ebuf;
618 	*ep = c;
619 	if (c != 0) {
620 		ep++;
621 	}
622 	return (c);
623 }
624 
unput(int c)625 void unput(int c)	/* put lexical character back on input */
626 {
627 	if (yysptr >= yysbuf + sizeof(yysbuf))
628 		FATAL("pushed back too much: %.20s...", yysbuf);
629 	*yysptr++ = c;
630 	if (--ep < ebuf)
631 		ep = ebuf + sizeof(ebuf) - 1;
632 }
633 
/*
 * unputstr: push the whole string s back onto the input.
 *
 * Characters are pushed last-to-first so that subsequent input() calls
 * return them in their original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	size_t i;

	/* count down in size_t: no narrowing of strlen() and no "-1" index */
	for (i = strlen(s); i > 0; i--)
		unput(s[i - 1]);
}
641