xref: /freebsd/contrib/one-true-awk/lex.c (revision 024248c933c5741a21c17eda63092f330dd98337)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31 
32 extern YYSTYPE	yylval;
33 extern bool	infunc;
34 
/*
 * Lexer bookkeeping shared with the rest of awk.
 */

/* Current source line; adjusted by yylex(), string() and unput(). */
int	lineno = 1;

/* Nesting depth of each bracket kind: ++ on the opener, -- on the closer. */
int	parencnt = 0;	/* ( ) */
int	brackcnt = 0;	/* [ ] */
int	bracecnt = 0;	/* { } */
39 
/* One row of the reserved-word table searched by word(). */
typedef struct Keyword Keyword;

struct Keyword {
	const char *word;	/* spelling compared with strcmp() */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token code word() returns */
};
45 
46 const Keyword keywords[] = {	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "and",	FAND,		BLTIN },
51 	{ "atan2",	FATAN,		BLTIN },
52 	{ "break",	BREAK,		BREAK },
53 	{ "close",	CLOSE,		CLOSE },
54 	{ "compl",	FCOMPL,		BLTIN },
55 	{ "continue",	CONTINUE,	CONTINUE },
56 	{ "cos",	FCOS,		BLTIN },
57 	{ "delete",	DELETE,		DELETE },
58 	{ "do",		DO,		DO },
59 	{ "else",	ELSE,		ELSE },
60 	{ "exit",	EXIT,		EXIT },
61 	{ "exp",	FEXP,		BLTIN },
62 	{ "fflush",	FFLUSH,		BLTIN },
63 	{ "for",	FOR,		FOR },
64 	{ "func",	FUNC,		FUNC },
65 	{ "function",	FUNC,		FUNC },
66 	{ "gensub",	GENSUB,		GENSUB },
67 	{ "getline",	GETLINE,	GETLINE },
68 	{ "gsub",	GSUB,		GSUB },
69 	{ "if",		IF,		IF },
70 	{ "in",		IN,		IN },
71 	{ "index",	INDEX,		INDEX },
72 	{ "int",	FINT,		BLTIN },
73 	{ "length",	FLENGTH,	BLTIN },
74 	{ "log",	FLOG,		BLTIN },
75 	{ "lshift",	FLSHIFT,	BLTIN },
76 	{ "match",	MATCHFCN,	MATCHFCN },
77 	{ "next",	NEXT,		NEXT },
78 	{ "nextfile",	NEXTFILE,	NEXTFILE },
79 	{ "or",		FFOR,		BLTIN },
80 	{ "print",	PRINT,		PRINT },
81 	{ "printf",	PRINTF,		PRINTF },
82 	{ "rand",	FRAND,		BLTIN },
83 	{ "return",	RETURN,		RETURN },
84 	{ "rshift",	FRSHIFT,	BLTIN },
85 	{ "sin",	FSIN,		BLTIN },
86 	{ "split",	SPLIT,		SPLIT },
87 	{ "sprintf",	SPRINTF,	SPRINTF },
88 	{ "sqrt",	FSQRT,		BLTIN },
89 	{ "srand",	FSRAND,		BLTIN },
90 	{ "strftime",	FSTRFTIME,	BLTIN },
91 	{ "sub",	SUB,		SUB },
92 	{ "substr",	SUBSTR,		SUBSTR },
93 	{ "system",	FSYSTEM,	BLTIN },
94 	{ "systime",	FSYSTIME,	BLTIN },
95 	{ "tolower",	FTOLOWER,	BLTIN },
96 	{ "toupper",	FTOUPPER,	BLTIN },
97 	{ "while",	WHILE,		WHILE },
98 	{ "xor",	FXOR,		BLTIN },
99 };
100 
/*
 * RET(x): return token code x from the enclosing function, printing its
 * name first when dbg is nonzero.  Wrapped in do/while(0) so that
 * "RET(x);" is exactly one statement and stays correct under an
 * unbraced if/else.  Note that x is evaluated twice when dbg is set.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
102 
/*
 * peek - report the next input character without consuming it.
 * Implemented as a read followed immediately by a pushback.
 */
static int peek(void)
{
	int ahead;

	ahead = input();
	unput(ahead);
	return ahead;
}
109 
110 static int gettok(char **pbuf, int *psz)	/* get next input token */
111 {
112 	int c, retc;
113 	char *buf = *pbuf;
114 	int sz = *psz;
115 	char *bp = buf;
116 
117 	c = input();
118 	if (c == 0)
119 		return 0;
120 	buf[0] = c;
121 	buf[1] = 0;
122 	if (!isalnum(c) && c != '.' && c != '_')
123 		return c;
124 
125 	*bp++ = c;
126 	if (isalpha(c) || c == '_') {	/* it's a varname */
127 		for ( ; (c = input()) != 0; ) {
128 			if (bp-buf >= sz)
129 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
130 					FATAL( "out of space for name %.10s...", buf );
131 			if (isalnum(c) || c == '_')
132 				*bp++ = c;
133 			else {
134 				*bp = 0;
135 				unput(c);
136 				break;
137 			}
138 		}
139 		*bp = 0;
140 		retc = 'a';	/* alphanumeric */
141 	} else {	/* maybe it's a number, but could be . */
142 		char *rem;
143 		/* read input until can't be a number */
144 		for ( ; (c = input()) != 0; ) {
145 			if (bp-buf >= sz)
146 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
147 					FATAL( "out of space for number %.10s...", buf );
148 			if (isdigit(c) || c == 'e' || c == 'E'
149 			  || c == '.' || c == '+' || c == '-')
150 				*bp++ = c;
151 			else {
152 				unput(c);
153 				break;
154 			}
155 		}
156 		*bp = 0;
157 		strtod(buf, &rem);	/* parse the number */
158 		if (rem == buf) {	/* it wasn't a valid number at all */
159 			buf[1] = 0;	/* return one character as token */
160 			retc = (uschar)buf[0];	/* character is its own type */
161 			unputstr(rem+1); /* put rest back for later */
162 		} else {	/* some prefix was a number */
163 			unputstr(rem);	/* put rest back for later */
164 			rem[0] = 0;	/* truncate buf after number part */
165 			retc = '0';	/* type is number */
166 		}
167 	}
168 	*pbuf = buf;
169 	*psz = sz;
170 	return retc;
171 }
172 
/* Forward declarations for the token readers defined later in this file. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined (and cleared) at the top of yylex(). */
bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
178 
179 int yylex(void)
180 {
181 	int c;
182 	static char *buf = NULL;
183 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
184 
185 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
186 		FATAL( "out of space in yylex" );
187 	if (sc) {
188 		sc = false;
189 		RET('}');
190 	}
191 	if (reg) {
192 		reg = false;
193 		return regexpr();
194 	}
195 	for (;;) {
196 		c = gettok(&buf, &bufsize);
197 		if (c == 0)
198 			return 0;
199 		if (isalpha(c) || c == '_')
200 			return word(buf);
201 		if (isdigit(c)) {
202 			char *cp = tostring(buf);
203 			double result;
204 
205 			if (is_number(cp, & result))
206 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
207 			else
208 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
209 			free(cp);
210 			/* should this also have STR set? */
211 			RET(NUMBER);
212 		}
213 
214 		yylval.i = c;
215 		switch (c) {
216 		case '\n':	/* {EOL} */
217 			lineno++;
218 			RET(NL);
219 		case '\r':	/* assume \n is coming */
220 		case ' ':	/* {WS}+ */
221 		case '\t':
222 			break;
223 		case '#':	/* #.* strip comments */
224 			while ((c = input()) != '\n' && c != 0)
225 				;
226 			unput(c);
227 			/*
228 			 * Next line is a hack, it compensates for
229 			 * unput's treatment of \n.
230 			 */
231 			lineno++;
232 			break;
233 		case ';':
234 			RET(';');
235 		case '\\':
236 			if (peek() == '\n') {
237 				input();
238 				lineno++;
239 			} else if (peek() == '\r') {
240 				input(); input();	/* \n */
241 				lineno++;
242 			} else {
243 				RET(c);
244 			}
245 			break;
246 		case '&':
247 			if (peek() == '&') {
248 				input(); RET(AND);
249 			} else
250 				RET('&');
251 		case '|':
252 			if (peek() == '|') {
253 				input(); RET(BOR);
254 			} else
255 				RET('|');
256 		case '!':
257 			if (peek() == '=') {
258 				input(); yylval.i = NE; RET(NE);
259 			} else if (peek() == '~') {
260 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
261 			} else
262 				RET(NOT);
263 		case '~':
264 			yylval.i = MATCH;
265 			RET(MATCHOP);
266 		case '<':
267 			if (peek() == '=') {
268 				input(); yylval.i = LE; RET(LE);
269 			} else {
270 				yylval.i = LT; RET(LT);
271 			}
272 		case '=':
273 			if (peek() == '=') {
274 				input(); yylval.i = EQ; RET(EQ);
275 			} else {
276 				yylval.i = ASSIGN; RET(ASGNOP);
277 			}
278 		case '>':
279 			if (peek() == '=') {
280 				input(); yylval.i = GE; RET(GE);
281 			} else if (peek() == '>') {
282 				input(); yylval.i = APPEND; RET(APPEND);
283 			} else {
284 				yylval.i = GT; RET(GT);
285 			}
286 		case '+':
287 			if (peek() == '+') {
288 				input(); yylval.i = INCR; RET(INCR);
289 			} else if (peek() == '=') {
290 				input(); yylval.i = ADDEQ; RET(ASGNOP);
291 			} else
292 				RET('+');
293 		case '-':
294 			if (peek() == '-') {
295 				input(); yylval.i = DECR; RET(DECR);
296 			} else if (peek() == '=') {
297 				input(); yylval.i = SUBEQ; RET(ASGNOP);
298 			} else
299 				RET('-');
300 		case '*':
301 			if (peek() == '=') {	/* *= */
302 				input(); yylval.i = MULTEQ; RET(ASGNOP);
303 			} else if (peek() == '*') {	/* ** or **= */
304 				input();	/* eat 2nd * */
305 				if (peek() == '=') {
306 					input(); yylval.i = POWEQ; RET(ASGNOP);
307 				} else {
308 					RET(POWER);
309 				}
310 			} else
311 				RET('*');
312 		case '/':
313 			RET('/');
314 		case '%':
315 			if (peek() == '=') {
316 				input(); yylval.i = MODEQ; RET(ASGNOP);
317 			} else
318 				RET('%');
319 		case '^':
320 			if (peek() == '=') {
321 				input(); yylval.i = POWEQ; RET(ASGNOP);
322 			} else
323 				RET(POWER);
324 
325 		case '$':
326 			/* BUG: awkward, if not wrong */
327 			c = gettok(&buf, &bufsize);
328 			if (isalpha(c)) {
329 				if (strcmp(buf, "NF") == 0) {	/* very special */
330 					unputstr("(NF)");
331 					RET(INDIRECT);
332 				}
333 				c = peek();
334 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
335 					unputstr(buf);
336 					RET(INDIRECT);
337 				}
338 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
339 				RET(IVAR);
340 			} else if (c == 0) {	/*  */
341 				SYNTAX( "unexpected end of input after $" );
342 				RET(';');
343 			} else {
344 				unputstr(buf);
345 				RET(INDIRECT);
346 			}
347 
348 		case '}':
349 			if (--bracecnt < 0)
350 				SYNTAX( "extra }" );
351 			sc = true;
352 			RET(';');
353 		case ']':
354 			if (--brackcnt < 0)
355 				SYNTAX( "extra ]" );
356 			RET(']');
357 		case ')':
358 			if (--parencnt < 0)
359 				SYNTAX( "extra )" );
360 			RET(')');
361 		case '{':
362 			bracecnt++;
363 			RET('{');
364 		case '[':
365 			brackcnt++;
366 			RET('[');
367 		case '(':
368 			parencnt++;
369 			RET('(');
370 
371 		case '"':
372 			return string();	/* BUG: should be like tran.c ? */
373 
374 		default:
375 			RET(c);
376 		}
377 	}
378 }
379 
380 extern int runetochar(char *str, int c);
381 
382 int string(void)
383 {
384 	int c, n;
385 	char *s, *bp;
386 	static char *buf = NULL;
387 	static int bufsz = 500;
388 
389 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
390 		FATAL("out of space for strings");
391 	for (bp = buf; (c = input()) != '"'; ) {
392 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
393 			FATAL("out of space for string %.10s...", buf);
394 		switch (c) {
395 		case '\n':
396 		case '\r':
397 		case 0:
398 			*bp = '\0';
399 			SYNTAX( "non-terminated string %.10s...", buf );
400 			if (c == 0)	/* hopeless */
401 				FATAL( "giving up" );
402 			lineno++;
403 			break;
404 		case '\\':
405 			c = input();
406 			switch (c) {
407 			case '\n': break;
408 			case '"': *bp++ = '"'; break;
409 			case 'n': *bp++ = '\n'; break;
410 			case 't': *bp++ = '\t'; break;
411 			case 'f': *bp++ = '\f'; break;
412 			case 'r': *bp++ = '\r'; break;
413 			case 'b': *bp++ = '\b'; break;
414 			case 'v': *bp++ = '\v'; break;
415 			case 'a': *bp++ = '\a'; break;
416 			case '\\': *bp++ = '\\'; break;
417 
418 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
419 			case '3': case '4': case '5': case '6': case '7':
420 				n = c - '0';
421 				if ((c = peek()) >= '0' && c < '8') {
422 					n = 8 * n + input() - '0';
423 					if ((c = peek()) >= '0' && c < '8')
424 						n = 8 * n + input() - '0';
425 				}
426 				*bp++ = n;
427 				break;
428 
429 			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
430 			    {
431 				int i;
432 
433 				if (!isxdigit(peek())) {
434 					unput(c);
435 					break;
436 				}
437 				n = 0;
438 				for (i = 0; i < 2; i++) {
439 					c = input();
440 					if (c == 0)
441 						break;
442 					if (isxdigit(c)) {
443 						c = tolower(c);
444 						n *= 16;
445 						if (isdigit(c))
446 							n += (c - '0');
447 						else
448 							n += 10 + (c - 'a');
449 					} else {
450 						unput(c);
451 						break;
452 					}
453 				}
454 				if (i)
455 					*bp++ = n;
456 				break;
457 			    }
458 
459 			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
460 			    {
461 				int i;
462 
463 				n = 0;
464 				for (i = 0; i < 8; i++) {
465 					c = input();
466 					if (!isxdigit(c) || c == 0)
467 						break;
468 					c = tolower(c);
469 					n *= 16;
470 					if (isdigit(c))
471 						n += (c - '0');
472 					else
473 						n += 10 + (c - 'a');
474 				}
475 				unput(c);
476 				bp += runetochar(bp, n);
477 				break;
478 			    }
479 
480 			default:
481 				*bp++ = c;
482 				break;
483 			}
484 			break;
485 		default:
486 			*bp++ = c;
487 			break;
488 		}
489 	}
490 	*bp = 0;
491 	s = tostring(buf);
492 	*bp++ = ' '; *bp++ = '\0';
493 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
494 	free(s);
495 	RET(STRING);
496 }
497 
498 
499 static int binsearch(char *w, const Keyword *kp, int n)
500 {
501 	int cond, low, mid, high;
502 
503 	low = 0;
504 	high = n - 1;
505 	while (low <= high) {
506 		mid = (low + high) / 2;
507 		if ((cond = strcmp(w, kp[mid].word)) < 0)
508 			high = mid - 1;
509 		else if (cond > 0)
510 			low = mid + 1;
511 		else
512 			return mid;
513 	}
514 	return -1;
515 }
516 
517 int word(char *w)
518 {
519 	const Keyword *kp;
520 	int c, n;
521 
522 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
523 	if (n != -1) {	/* found in table */
524 		kp = keywords + n;
525 		yylval.i = kp->sub;
526 		switch (kp->type) {	/* special handling */
527 		case BLTIN:
528 			if (kp->sub == FSYSTEM && safe)
529 				SYNTAX( "system is unsafe" );
530 			RET(kp->type);
531 		case FUNC:
532 			if (infunc)
533 				SYNTAX( "illegal nested function" );
534 			RET(kp->type);
535 		case RETURN:
536 			if (!infunc)
537 				SYNTAX( "return not in function" );
538 			RET(kp->type);
539 		case VARNF:
540 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
541 			RET(VARNF);
542 		default:
543 			RET(kp->type);
544 		}
545 	}
546 	c = peek();	/* look for '(' */
547 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
548 		yylval.i = n;
549 		RET(ARG);
550 	} else {
551 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
552 		if (c == '(') {
553 			RET(CALL);
554 		} else {
555 			RET(VAR);
556 		}
557 	}
558 }
559 
560 void startreg(void)	/* next call to yylex will return a regular expression */
561 {
562 	reg = true;
563 }
564 
565 int regexpr(void)
566 {
567 	int c;
568 	static char *buf = NULL;
569 	static int bufsz = 500;
570 	char *bp;
571 
572 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
573 		FATAL("out of space for reg expr");
574 	bp = buf;
575 	for ( ; (c = input()) != '/' && c != 0; ) {
576 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
577 			FATAL("out of space for reg expr %.10s...", buf);
578 		if (c == '\n') {
579 			*bp = '\0';
580 			SYNTAX( "newline in regular expression %.10s...", buf );
581 			unput('\n');
582 			break;
583 		} else if (c == '\\') {
584 			*bp++ = '\\';
585 			*bp++ = input();
586 		} else {
587 			*bp++ = c;
588 		}
589 	}
590 	*bp = 0;
591 	if (c == 0)
592 		SYNTAX("non-terminated regular expression %.10s...", buf);
593 	yylval.s = tostring(buf);
594 	unput('/');
595 	RET(REGEXPR);
596 }
597 
598 /* low-level lexical stuff, sort of inherited from lex */
599 
/* Pushback stack: unput() pushes at yysptr, input() pops from it first. */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;

/* Circular record of the characters handed out by input(). */
char	ebuf[300];
char	*ep = ebuf;	/* slot the next character will be stored in */

/* Not referenced elsewhere in this file. */
FILE	*yyin = NULL;
605 
606 int input(void)	/* get next lexical input character */
607 {
608 	int c;
609 	extern char *lexprog;
610 
611 	if (yysptr > yysbuf)
612 		c = (uschar)*--yysptr;
613 	else if (lexprog != NULL) {	/* awk '...' */
614 		if ((c = (uschar)*lexprog) != 0)
615 			lexprog++;
616 	} else				/* awk -f ... */
617 		c = pgetc();
618 	if (c == EOF)
619 		c = 0;
620 	if (ep >= ebuf + sizeof ebuf)
621 		ep = ebuf;
622 	*ep = c;
623 	if (c != 0) {
624 		ep++;
625 	}
626 	return (c);
627 }
628 
629 void unput(int c)	/* put lexical character back on input */
630 {
631 	if (c == '\n')
632 		lineno--;
633 	if (yysptr >= yysbuf + sizeof(yysbuf))
634 		FATAL("pushed back too much: %.20s...", yysbuf);
635 	*yysptr++ = c;
636 	if (--ep < ebuf)
637 		ep = ebuf + sizeof(ebuf) - 1;
638 }
639 
/*
 * unputstr - push the whole string s back onto the input.  Characters
 * are pushed last-to-first so they are read back in their original order.
 */
void unputstr(const char *s)
{
	const char *tail = s + strlen(s);

	while (tail > s)
		unput(*--tail);
}
647