xref: /freebsd/contrib/one-true-awk/lex.c (revision b7f7cc25c01aeacaafb86ebcffdeb258b7933b08)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31 
32 extern YYSTYPE	yylval;
33 extern bool	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
/* One row of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling as written in the awk program */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token type word() returns */
} Keyword;
45 
46 const Keyword keywords[] = {	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "and",	FAND,		BLTIN },
51 	{ "atan2",	FATAN,		BLTIN },
52 	{ "break",	BREAK,		BREAK },
53 	{ "close",	CLOSE,		CLOSE },
54 	{ "compl",	FCOMPL,		BLTIN },
55 	{ "continue",	CONTINUE,	CONTINUE },
56 	{ "cos",	FCOS,		BLTIN },
57 	{ "delete",	DELETE,		DELETE },
58 	{ "do",		DO,		DO },
59 	{ "else",	ELSE,		ELSE },
60 	{ "exit",	EXIT,		EXIT },
61 	{ "exp",	FEXP,		BLTIN },
62 	{ "fflush",	FFLUSH,		BLTIN },
63 	{ "for",	FOR,		FOR },
64 	{ "func",	FUNC,		FUNC },
65 	{ "function",	FUNC,		FUNC },
66 	{ "gensub",	GENSUB,		GENSUB },
67 	{ "getline",	GETLINE,	GETLINE },
68 	{ "gsub",	GSUB,		GSUB },
69 	{ "if",		IF,		IF },
70 	{ "in",		IN,		IN },
71 	{ "index",	INDEX,		INDEX },
72 	{ "int",	FINT,		BLTIN },
73 	{ "length",	FLENGTH,	BLTIN },
74 	{ "log",	FLOG,		BLTIN },
75 	{ "lshift",	FLSHIFT,	BLTIN },
76 	{ "match",	MATCHFCN,	MATCHFCN },
77 	{ "next",	NEXT,		NEXT },
78 	{ "nextfile",	NEXTFILE,	NEXTFILE },
79 	{ "or",		FFOR,		BLTIN },
80 	{ "print",	PRINT,		PRINT },
81 	{ "printf",	PRINTF,		PRINTF },
82 	{ "rand",	FRAND,		BLTIN },
83 	{ "return",	RETURN,		RETURN },
84 	{ "rshift",	FRSHIFT,	BLTIN },
85 	{ "sin",	FSIN,		BLTIN },
86 	{ "split",	SPLIT,		SPLIT },
87 	{ "sprintf",	SPRINTF,	SPRINTF },
88 	{ "sqrt",	FSQRT,		BLTIN },
89 	{ "srand",	FSRAND,		BLTIN },
90 	{ "strftime",	FSTRFTIME,	BLTIN },
91 	{ "sub",	SUB,		SUB },
92 	{ "substr",	SUBSTR,		SUBSTR },
93 	{ "system",	FSYSTEM,	BLTIN },
94 	{ "systime",	FSYSTIME,	BLTIN },
95 	{ "tolower",	FTOLOWER,	BLTIN },
96 	{ "toupper",	FTOUPPER,	BLTIN },
97 	{ "while",	WHILE,		WHILE },
98 	{ "xor",	FXOR,		BLTIN },
99 };
100 
/*
 * RET(x) - return token x from the enclosing lexer function; when dbg is
 * set, print the token's name first.  The body is wrapped in
 * do { } while (0) so that "RET(x);" is exactly one statement and can be
 * used as the unbraced branch of an if/else.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
102 
/* peek - report the next input character without consuming it */
static int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back to the pushback buffer */
	return next;
}
109 
110 static int gettok(char **pbuf, int *psz)	/* get next input token */
111 {
112 	int c, retc;
113 	char *buf = *pbuf;
114 	int sz = *psz;
115 	char *bp = buf;
116 
117 	c = input();
118 	if (c == 0)
119 		return 0;
120 	buf[0] = c;
121 	buf[1] = 0;
122 	if (!isalnum(c) && c != '.' && c != '_')
123 		return c;
124 
125 	*bp++ = c;
126 	if (isalpha(c) || c == '_') {	/* it's a varname */
127 		for ( ; (c = input()) != 0; ) {
128 			if (bp-buf >= sz)
129 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
130 					FATAL( "out of space for name %.10s...", buf );
131 			if (isalnum(c) || c == '_')
132 				*bp++ = c;
133 			else {
134 				*bp = 0;
135 				unput(c);
136 				break;
137 			}
138 		}
139 		*bp = 0;
140 		retc = 'a';	/* alphanumeric */
141 	} else {	/* maybe it's a number, but could be . */
142 		char *rem;
143 		/* read input until can't be a number */
144 		for ( ; (c = input()) != 0; ) {
145 			if (bp-buf >= sz)
146 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
147 					FATAL( "out of space for number %.10s...", buf );
148 			if (isdigit(c) || c == 'e' || c == 'E'
149 			  || c == '.' || c == '+' || c == '-')
150 				*bp++ = c;
151 			else {
152 				unput(c);
153 				break;
154 			}
155 		}
156 		*bp = 0;
157 		strtod(buf, &rem);	/* parse the number */
158 		if (rem == buf) {	/* it wasn't a valid number at all */
159 			buf[1] = 0;	/* return one character as token */
160 			retc = (uschar)buf[0];	/* character is its own type */
161 			unputstr(rem+1); /* put rest back for later */
162 		} else {	/* some prefix was a number */
163 			unputstr(rem);	/* put rest back for later */
164 			rem[0] = 0;	/* truncate buf after number part */
165 			retc = '0';	/* type is number */
166 		}
167 	}
168 	*pbuf = buf;
169 	*psz = sz;
170 	return retc;
171 }
172 
/* Scanners that yylex() hands off to. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
178 
179 int yylex(void)
180 {
181 	int c;
182 	static char *buf = NULL;
183 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
184 
185 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
186 		FATAL( "out of space in yylex" );
187 	if (sc) {
188 		sc = false;
189 		RET('}');
190 	}
191 	if (reg) {
192 		reg = false;
193 		return regexpr();
194 	}
195 	for (;;) {
196 		c = gettok(&buf, &bufsize);
197 		if (c == 0)
198 			return 0;
199 		if (isalpha(c) || c == '_')
200 			return word(buf);
201 		if (isdigit(c)) {
202 			char *cp = tostring(buf);
203 			double result;
204 
205 			if (is_number(cp, & result))
206 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
207 			else
208 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
209 			free(cp);
210 			/* should this also have STR set? */
211 			RET(NUMBER);
212 		}
213 
214 		yylval.i = c;
215 		switch (c) {
216 		case '\n':	/* {EOL} */
217 			lineno++;
218 			RET(NL);
219 		case '\r':	/* assume \n is coming */
220 		case ' ':	/* {WS}+ */
221 		case '\t':
222 			break;
223 		case '#':	/* #.* strip comments */
224 			while ((c = input()) != '\n' && c != 0)
225 				;
226 			unput(c);
227 			/*
228 			 * Next line is a hack, itcompensates for
229 			 * unput's treatment of \n.
230 			 */
231 			lineno++;
232 			break;
233 		case ';':
234 			RET(';');
235 		case '\\':
236 			if (peek() == '\n') {
237 				input();
238 				lineno++;
239 			} else if (peek() == '\r') {
240 				input(); input();	/* \n */
241 				lineno++;
242 			} else {
243 				RET(c);
244 			}
245 			break;
246 		case '&':
247 			if (peek() == '&') {
248 				input(); RET(AND);
249 			} else
250 				RET('&');
251 		case '|':
252 			if (peek() == '|') {
253 				input(); RET(BOR);
254 			} else
255 				RET('|');
256 		case '!':
257 			if (peek() == '=') {
258 				input(); yylval.i = NE; RET(NE);
259 			} else if (peek() == '~') {
260 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
261 			} else
262 				RET(NOT);
263 		case '~':
264 			yylval.i = MATCH;
265 			RET(MATCHOP);
266 		case '<':
267 			if (peek() == '=') {
268 				input(); yylval.i = LE; RET(LE);
269 			} else {
270 				yylval.i = LT; RET(LT);
271 			}
272 		case '=':
273 			if (peek() == '=') {
274 				input(); yylval.i = EQ; RET(EQ);
275 			} else {
276 				yylval.i = ASSIGN; RET(ASGNOP);
277 			}
278 		case '>':
279 			if (peek() == '=') {
280 				input(); yylval.i = GE; RET(GE);
281 			} else if (peek() == '>') {
282 				input(); yylval.i = APPEND; RET(APPEND);
283 			} else {
284 				yylval.i = GT; RET(GT);
285 			}
286 		case '+':
287 			if (peek() == '+') {
288 				input(); yylval.i = INCR; RET(INCR);
289 			} else if (peek() == '=') {
290 				input(); yylval.i = ADDEQ; RET(ASGNOP);
291 			} else
292 				RET('+');
293 		case '-':
294 			if (peek() == '-') {
295 				input(); yylval.i = DECR; RET(DECR);
296 			} else if (peek() == '=') {
297 				input(); yylval.i = SUBEQ; RET(ASGNOP);
298 			} else
299 				RET('-');
300 		case '*':
301 			if (peek() == '=') {	/* *= */
302 				input(); yylval.i = MULTEQ; RET(ASGNOP);
303 			} else if (peek() == '*') {	/* ** or **= */
304 				input();	/* eat 2nd * */
305 				if (peek() == '=') {
306 					input(); yylval.i = POWEQ; RET(ASGNOP);
307 				} else {
308 					RET(POWER);
309 				}
310 			} else
311 				RET('*');
312 		case '/':
313 			RET('/');
314 		case '%':
315 			if (peek() == '=') {
316 				input(); yylval.i = MODEQ; RET(ASGNOP);
317 			} else
318 				RET('%');
319 		case '^':
320 			if (peek() == '=') {
321 				input(); yylval.i = POWEQ; RET(ASGNOP);
322 			} else
323 				RET(POWER);
324 
325 		case '$':
326 			/* BUG: awkward, if not wrong */
327 			c = gettok(&buf, &bufsize);
328 			if (isalpha(c)) {
329 				if (strcmp(buf, "NF") == 0) {	/* very special */
330 					unputstr("(NF)");
331 					RET(INDIRECT);
332 				}
333 				c = peek();
334 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
335 					unputstr(buf);
336 					RET(INDIRECT);
337 				}
338 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
339 				RET(IVAR);
340 			} else if (c == 0) {	/*  */
341 				SYNTAX( "unexpected end of input after $" );
342 				RET(';');
343 			} else {
344 				unputstr(buf);
345 				RET(INDIRECT);
346 			}
347 
348 		case '}':
349 			if (--bracecnt < 0)
350 				SYNTAX( "extra }" );
351 			sc = true;
352 			RET(';');
353 		case ']':
354 			if (--brackcnt < 0)
355 				SYNTAX( "extra ]" );
356 			RET(']');
357 		case ')':
358 			if (--parencnt < 0)
359 				SYNTAX( "extra )" );
360 			RET(')');
361 		case '{':
362 			bracecnt++;
363 			RET('{');
364 		case '[':
365 			brackcnt++;
366 			RET('[');
367 		case '(':
368 			parencnt++;
369 			RET('(');
370 
371 		case '"':
372 			return string();	/* BUG: should be like tran.c ? */
373 
374 		default:
375 			RET(c);
376 		}
377 	}
378 }
379 
380 extern int runetochar(char *str, int c);
381 
382 int string(void)
383 {
384 	int c, n;
385 	char *s, *bp;
386 	static char *buf = NULL;
387 	static int bufsz = 500;
388 
389 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
390 		FATAL("out of space for strings");
391 	for (bp = buf; (c = input()) != '"'; ) {
392 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
393 			FATAL("out of space for string %.10s...", buf);
394 		switch (c) {
395 		case '\n':
396 		case '\r':
397 		case 0:
398 			*bp = '\0';
399 			SYNTAX( "non-terminated string %.10s...", buf );
400 			if (c == 0)	/* hopeless */
401 				FATAL( "giving up" );
402 			lineno++;
403 			break;
404 		case '\\':
405 			c = input();
406 			switch (c) {
407 			case '\n': break;
408 			case '"': *bp++ = '"'; break;
409 			case 'n': *bp++ = '\n'; break;
410 			case 't': *bp++ = '\t'; break;
411 			case 'f': *bp++ = '\f'; break;
412 			case 'r': *bp++ = '\r'; break;
413 			case 'b': *bp++ = '\b'; break;
414 			case 'v': *bp++ = '\v'; break;
415 			case 'a': *bp++ = '\a'; break;
416 			case '\\': *bp++ = '\\'; break;
417 
418 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
419 			case '3': case '4': case '5': case '6': case '7':
420 				n = c - '0';
421 				if ((c = peek()) >= '0' && c < '8') {
422 					n = 8 * n + input() - '0';
423 					if ((c = peek()) >= '0' && c < '8')
424 						n = 8 * n + input() - '0';
425 				}
426 				*bp++ = n;
427 				break;
428 
429 			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
430 			    {
431 				int i;
432 
433 				n = 0;
434 				for (i = 1; i <= 2; i++) {
435 					c = input();
436 					if (c == 0)
437 						break;
438 					if (isxdigit(c)) {
439 						c = tolower(c);
440 						n *= 16;
441 						if (isdigit(c))
442 							n += (c - '0');
443 						else
444 							n += 10 + (c - 'a');
445 					} else
446 						break;
447 				}
448 				if (n)
449 					*bp++ = n;
450 				else
451 					unput(c);
452 				break;
453 			    }
454 
455 			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
456 			    {
457 				int i;
458 
459 				n = 0;
460 				for (i = 0; i < 8; i++) {
461 					c = input();
462 					if (!isxdigit(c) || c == 0)
463 						break;
464 					c = tolower(c);
465 					n *= 16;
466 					if (isdigit(c))
467 						n += (c - '0');
468 					else
469 						n += 10 + (c - 'a');
470 				}
471 				unput(c);
472 				bp += runetochar(bp, n);
473 				break;
474 			    }
475 
476 			default:
477 				*bp++ = c;
478 				break;
479 			}
480 			break;
481 		default:
482 			*bp++ = c;
483 			break;
484 		}
485 	}
486 	*bp = 0;
487 	s = tostring(buf);
488 	*bp++ = ' '; *bp++ = '\0';
489 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
490 	free(s);
491 	RET(STRING);
492 }
493 
494 
495 static int binsearch(char *w, const Keyword *kp, int n)
496 {
497 	int cond, low, mid, high;
498 
499 	low = 0;
500 	high = n - 1;
501 	while (low <= high) {
502 		mid = (low + high) / 2;
503 		if ((cond = strcmp(w, kp[mid].word)) < 0)
504 			high = mid - 1;
505 		else if (cond > 0)
506 			low = mid + 1;
507 		else
508 			return mid;
509 	}
510 	return -1;
511 }
512 
513 int word(char *w)
514 {
515 	const Keyword *kp;
516 	int c, n;
517 
518 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
519 	if (n != -1) {	/* found in table */
520 		kp = keywords + n;
521 		yylval.i = kp->sub;
522 		switch (kp->type) {	/* special handling */
523 		case BLTIN:
524 			if (kp->sub == FSYSTEM && safe)
525 				SYNTAX( "system is unsafe" );
526 			RET(kp->type);
527 		case FUNC:
528 			if (infunc)
529 				SYNTAX( "illegal nested function" );
530 			RET(kp->type);
531 		case RETURN:
532 			if (!infunc)
533 				SYNTAX( "return not in function" );
534 			RET(kp->type);
535 		case VARNF:
536 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
537 			RET(VARNF);
538 		default:
539 			RET(kp->type);
540 		}
541 	}
542 	c = peek();	/* look for '(' */
543 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
544 		yylval.i = n;
545 		RET(ARG);
546 	} else {
547 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
548 		if (c == '(') {
549 			RET(CALL);
550 		} else {
551 			RET(VAR);
552 		}
553 	}
554 }
555 
556 void startreg(void)	/* next call to yylex will return a regular expression */
557 {
558 	reg = true;
559 }
560 
561 int regexpr(void)
562 {
563 	int c;
564 	static char *buf = NULL;
565 	static int bufsz = 500;
566 	char *bp;
567 
568 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
569 		FATAL("out of space for reg expr");
570 	bp = buf;
571 	for ( ; (c = input()) != '/' && c != 0; ) {
572 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
573 			FATAL("out of space for reg expr %.10s...", buf);
574 		if (c == '\n') {
575 			*bp = '\0';
576 			SYNTAX( "newline in regular expression %.10s...", buf );
577 			unput('\n');
578 			break;
579 		} else if (c == '\\') {
580 			*bp++ = '\\';
581 			*bp++ = input();
582 		} else {
583 			*bp++ = c;
584 		}
585 	}
586 	*bp = 0;
587 	if (c == 0)
588 		SYNTAX("non-terminated regular expression %.10s...", buf);
589 	yylval.s = tostring(buf);
590 	unput('/');
591 	RET(REGEXPR);
592 }
593 
594 /* low-level lexical stuff, sort of inherited from lex */
595 
/* Circular record of the characters input() has delivered. */
char	ebuf[300];
char	*ep = ebuf;		/* next slot in ebuf */

/* Characters handed back with unput(); input() drains these first. */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed character */

FILE	*yyin = NULL;
601 
602 int input(void)	/* get next lexical input character */
603 {
604 	int c;
605 	extern char *lexprog;
606 
607 	if (yysptr > yysbuf)
608 		c = (uschar)*--yysptr;
609 	else if (lexprog != NULL) {	/* awk '...' */
610 		if ((c = (uschar)*lexprog) != 0)
611 			lexprog++;
612 	} else				/* awk -f ... */
613 		c = pgetc();
614 	if (c == EOF)
615 		c = 0;
616 	if (ep >= ebuf + sizeof ebuf)
617 		ep = ebuf;
618 	*ep = c;
619 	if (c != 0) {
620 		ep++;
621 	}
622 	return (c);
623 }
624 
625 void unput(int c)	/* put lexical character back on input */
626 {
627 	if (c == '\n')
628 		lineno--;
629 	if (yysptr >= yysbuf + sizeof(yysbuf))
630 		FATAL("pushed back too much: %.20s...", yysbuf);
631 	*yysptr++ = c;
632 	if (--ep < ebuf)
633 		ep = ebuf + sizeof(ebuf) - 1;
634 }
635 
/*
 * unputstr - put the string s back on the input.  Characters are pushed
 * last-first so that they are read back in their original order.
 */
void unputstr(const char *s)
{
	int i = strlen(s);

	while (--i >= 0)
		unput(s[i]);
}
642 }
643