xref: /freebsd/contrib/one-true-awk/lex.c (revision cbd30a72ca196976c1c700400ecd424baa1b9c16)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
32 extern YYSTYPE	yylval;
33 extern int	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
40 typedef struct Keyword {
41 	const char *word;
42 	int	sub;
43 	int	type;
44 } Keyword;
45 
46 Keyword keywords[] ={	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "atan2",	FATAN,		BLTIN },
51 	{ "break",	BREAK,		BREAK },
52 	{ "close",	CLOSE,		CLOSE },
53 	{ "continue",	CONTINUE,	CONTINUE },
54 	{ "cos",	FCOS,		BLTIN },
55 	{ "delete",	DELETE,		DELETE },
56 	{ "do",		DO,		DO },
57 	{ "else",	ELSE,		ELSE },
58 	{ "exit",	EXIT,		EXIT },
59 	{ "exp",	FEXP,		BLTIN },
60 	{ "fflush",	FFLUSH,		BLTIN },
61 	{ "for",	FOR,		FOR },
62 	{ "func",	FUNC,		FUNC },
63 	{ "function",	FUNC,		FUNC },
64 	{ "getline",	GETLINE,	GETLINE },
65 	{ "gsub",	GSUB,		GSUB },
66 	{ "if",		IF,		IF },
67 	{ "in",		IN,		IN },
68 	{ "index",	INDEX,		INDEX },
69 	{ "int",	FINT,		BLTIN },
70 	{ "length",	FLENGTH,	BLTIN },
71 	{ "log",	FLOG,		BLTIN },
72 	{ "match",	MATCHFCN,	MATCHFCN },
73 	{ "next",	NEXT,		NEXT },
74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
75 	{ "print",	PRINT,		PRINT },
76 	{ "printf",	PRINTF,		PRINTF },
77 	{ "rand",	FRAND,		BLTIN },
78 	{ "return",	RETURN,		RETURN },
79 	{ "sin",	FSIN,		BLTIN },
80 	{ "split",	SPLIT,		SPLIT },
81 	{ "sprintf",	SPRINTF,	SPRINTF },
82 	{ "sqrt",	FSQRT,		BLTIN },
83 	{ "srand",	FSRAND,		BLTIN },
84 	{ "sub",	SUB,		SUB },
85 	{ "substr",	SUBSTR,		SUBSTR },
86 	{ "system",	FSYSTEM,	BLTIN },
87 	{ "tolower",	FTOLOWER,	BLTIN },
88 	{ "toupper",	FTOUPPER,	BLTIN },
89 	{ "while",	WHILE,		WHILE },
90 };
91 
/*
 * RET(x): return token x from the enclosing function, printing its
 * name first when dbg is set.
 *
 * The body is wrapped in do { ... } while (0) so that "RET(x);" is one
 * statement and may be used as an unbraced if/else arm; a bare { ... }
 * block followed by ';' would break "if (c) RET(a); else ...".
 * x is evaluated twice when dbg is set, so it must be free of side effects.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
93 
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately pushed back
 * with unput(), so the following input() call returns it again.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
100 
/*
 * gettok: read the next raw token into the caller's buffer.
 *
 * pbuf/psz point at the caller's buffer and its size.  The buffer must
 * hold at least two bytes on entry; it is grown with adjbuf() as needed
 * and *pbuf / *psz are updated on the name and number paths.
 *
 * Returns:
 *   0        at end of input;
 *   the character itself for anything that cannot start a name or a
 *            number (buf then holds that single character);
 *   'a'      for a name ([A-Za-z_][A-Za-z0-9_]*), text in buf;
 *   '0'      for a number, text in buf;
 *   buf[0]   when the text started with a digit or '.' but strtod()
 *            accepted none of it (buf is cut back to one character).
 *
 * Characters read beyond the token are pushed back onto the input.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* <ctype.h> arguments must be representable as unsigned char */
	if (!isalnum((unsigned char)c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char)c) || c == '_') {	/* it's a varname */
		while ((c = input()) != 0) {
			/*
			 * Keep room for c AND the terminating NUL; checking
			 * only for c would let the final "*bp = 0" land one
			 * byte past the buffer when input ends right here.
			 */
			if (bp - buf + 2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char)c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;

		/* collect everything that could possibly belong to a number */
		while ((c = input()) != 0) {
			if (bp - buf + 2 > sz)	/* room for c and the NUL, as above */
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char)c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* let strtod decide how much is a number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: rem+1 is
			 * buf+1, so truncating first would push back an
			 * empty string and lose the characters read.
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
163 
/* helpers that yylex() hands off to */
int	word(char *);
int	string(void);
int	regexpr(void);

/* one-shot flags checked at the top of yylex() */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
169 
170 int yylex(void)
171 {
172 	int c;
173 	static char *buf = NULL;
174 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
175 
176 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
177 		FATAL( "out of space in yylex" );
178 	if (sc) {
179 		sc = 0;
180 		RET('}');
181 	}
182 	if (reg) {
183 		reg = 0;
184 		return regexpr();
185 	}
186 	for (;;) {
187 		c = gettok(&buf, &bufsize);
188 		if (c == 0)
189 			return 0;
190 		if (isalpha(c) || c == '_')
191 			return word(buf);
192 		if (isdigit(c)) {
193 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
194 			/* should this also have STR set? */
195 			RET(NUMBER);
196 		}
197 
198 		yylval.i = c;
199 		switch (c) {
200 		case '\n':	/* {EOL} */
201 			RET(NL);
202 		case '\r':	/* assume \n is coming */
203 		case ' ':	/* {WS}+ */
204 		case '\t':
205 			break;
206 		case '#':	/* #.* strip comments */
207 			while ((c = input()) != '\n' && c != 0)
208 				;
209 			unput(c);
210 			break;
211 		case ';':
212 			RET(';');
213 		case '\\':
214 			if (peek() == '\n') {
215 				input();
216 			} else if (peek() == '\r') {
217 				input(); input();	/* \n */
218 				lineno++;
219 			} else {
220 				RET(c);
221 			}
222 			break;
223 		case '&':
224 			if (peek() == '&') {
225 				input(); RET(AND);
226 			} else
227 				RET('&');
228 		case '|':
229 			if (peek() == '|') {
230 				input(); RET(BOR);
231 			} else
232 				RET('|');
233 		case '!':
234 			if (peek() == '=') {
235 				input(); yylval.i = NE; RET(NE);
236 			} else if (peek() == '~') {
237 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
238 			} else
239 				RET(NOT);
240 		case '~':
241 			yylval.i = MATCH;
242 			RET(MATCHOP);
243 		case '<':
244 			if (peek() == '=') {
245 				input(); yylval.i = LE; RET(LE);
246 			} else {
247 				yylval.i = LT; RET(LT);
248 			}
249 		case '=':
250 			if (peek() == '=') {
251 				input(); yylval.i = EQ; RET(EQ);
252 			} else {
253 				yylval.i = ASSIGN; RET(ASGNOP);
254 			}
255 		case '>':
256 			if (peek() == '=') {
257 				input(); yylval.i = GE; RET(GE);
258 			} else if (peek() == '>') {
259 				input(); yylval.i = APPEND; RET(APPEND);
260 			} else {
261 				yylval.i = GT; RET(GT);
262 			}
263 		case '+':
264 			if (peek() == '+') {
265 				input(); yylval.i = INCR; RET(INCR);
266 			} else if (peek() == '=') {
267 				input(); yylval.i = ADDEQ; RET(ASGNOP);
268 			} else
269 				RET('+');
270 		case '-':
271 			if (peek() == '-') {
272 				input(); yylval.i = DECR; RET(DECR);
273 			} else if (peek() == '=') {
274 				input(); yylval.i = SUBEQ; RET(ASGNOP);
275 			} else
276 				RET('-');
277 		case '*':
278 			if (peek() == '=') {	/* *= */
279 				input(); yylval.i = MULTEQ; RET(ASGNOP);
280 			} else if (peek() == '*') {	/* ** or **= */
281 				input();	/* eat 2nd * */
282 				if (peek() == '=') {
283 					input(); yylval.i = POWEQ; RET(ASGNOP);
284 				} else {
285 					RET(POWER);
286 				}
287 			} else
288 				RET('*');
289 		case '/':
290 			RET('/');
291 		case '%':
292 			if (peek() == '=') {
293 				input(); yylval.i = MODEQ; RET(ASGNOP);
294 			} else
295 				RET('%');
296 		case '^':
297 			if (peek() == '=') {
298 				input(); yylval.i = POWEQ; RET(ASGNOP);
299 			} else
300 				RET(POWER);
301 
302 		case '$':
303 			/* BUG: awkward, if not wrong */
304 			c = gettok(&buf, &bufsize);
305 			if (isalpha(c)) {
306 				if (strcmp(buf, "NF") == 0) {	/* very special */
307 					unputstr("(NF)");
308 					RET(INDIRECT);
309 				}
310 				c = peek();
311 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
312 					unputstr(buf);
313 					RET(INDIRECT);
314 				}
315 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
316 				RET(IVAR);
317 			} else if (c == 0) {	/*  */
318 				SYNTAX( "unexpected end of input after $" );
319 				RET(';');
320 			} else {
321 				unputstr(buf);
322 				RET(INDIRECT);
323 			}
324 
325 		case '}':
326 			if (--bracecnt < 0)
327 				SYNTAX( "extra }" );
328 			sc = 1;
329 			RET(';');
330 		case ']':
331 			if (--brackcnt < 0)
332 				SYNTAX( "extra ]" );
333 			RET(']');
334 		case ')':
335 			if (--parencnt < 0)
336 				SYNTAX( "extra )" );
337 			RET(')');
338 		case '{':
339 			bracecnt++;
340 			RET('{');
341 		case '[':
342 			brackcnt++;
343 			RET('[');
344 		case '(':
345 			parencnt++;
346 			RET('(');
347 
348 		case '"':
349 			return string();	/* BUG: should be like tran.c ? */
350 
351 		default:
352 			RET(c);
353 		}
354 	}
355 }
356 
357 int string(void)
358 {
359 	int c, n;
360 	char *s, *bp;
361 	static char *buf = NULL;
362 	static int bufsz = 500;
363 
364 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
365 		FATAL("out of space for strings");
366 	for (bp = buf; (c = input()) != '"'; ) {
367 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
368 			FATAL("out of space for string %.10s...", buf);
369 		switch (c) {
370 		case '\n':
371 		case '\r':
372 		case 0:
373 			SYNTAX( "non-terminated string %.10s...", buf );
374 			lineno++;
375 			if (c == 0)	/* hopeless */
376 				FATAL( "giving up" );
377 			break;
378 		case '\\':
379 			c = input();
380 			switch (c) {
381 			case '"': *bp++ = '"'; break;
382 			case 'n': *bp++ = '\n'; break;
383 			case 't': *bp++ = '\t'; break;
384 			case 'f': *bp++ = '\f'; break;
385 			case 'r': *bp++ = '\r'; break;
386 			case 'b': *bp++ = '\b'; break;
387 			case 'v': *bp++ = '\v'; break;
388 			case 'a': *bp++ = '\007'; break;
389 			case '\\': *bp++ = '\\'; break;
390 
391 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
392 			case '3': case '4': case '5': case '6': case '7':
393 				n = c - '0';
394 				if ((c = peek()) >= '0' && c < '8') {
395 					n = 8 * n + input() - '0';
396 					if ((c = peek()) >= '0' && c < '8')
397 						n = 8 * n + input() - '0';
398 				}
399 				*bp++ = n;
400 				break;
401 
402 			case 'x':	/* hex  \x0-9a-fA-F + */
403 			    {	char xbuf[100], *px;
404 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
405 					if (isdigit(c)
406 					 || (c >= 'a' && c <= 'f')
407 					 || (c >= 'A' && c <= 'F'))
408 						*px++ = c;
409 					else
410 						break;
411 				}
412 				*px = 0;
413 				unput(c);
414 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
415 				*bp++ = n;
416 				break;
417 			    }
418 
419 			default:
420 				*bp++ = c;
421 				break;
422 			}
423 			break;
424 		default:
425 			*bp++ = c;
426 			break;
427 		}
428 	}
429 	*bp = 0;
430 	s = tostring(buf);
431 	*bp++ = ' '; *bp++ = 0;
432 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
433 	RET(STRING);
434 }
435 
436 
437 int binsearch(char *w, Keyword *kp, int n)
438 {
439 	int cond, low, mid, high;
440 
441 	low = 0;
442 	high = n - 1;
443 	while (low <= high) {
444 		mid = (low + high) / 2;
445 		if ((cond = strcmp(w, kp[mid].word)) < 0)
446 			high = mid - 1;
447 		else if (cond > 0)
448 			low = mid + 1;
449 		else
450 			return mid;
451 	}
452 	return -1;
453 }
454 
455 int word(char *w)
456 {
457 	Keyword *kp;
458 	int c, n;
459 
460 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
461 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
462 	kp = keywords + n;
463 	if (n != -1) {	/* found in table */
464 		yylval.i = kp->sub;
465 		switch (kp->type) {	/* special handling */
466 		case BLTIN:
467 			if (kp->sub == FSYSTEM && safe)
468 				SYNTAX( "system is unsafe" );
469 			RET(kp->type);
470 		case FUNC:
471 			if (infunc)
472 				SYNTAX( "illegal nested function" );
473 			RET(kp->type);
474 		case RETURN:
475 			if (!infunc)
476 				SYNTAX( "return not in function" );
477 			RET(kp->type);
478 		case VARNF:
479 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
480 			RET(VARNF);
481 		default:
482 			RET(kp->type);
483 		}
484 	}
485 	c = peek();	/* look for '(' */
486 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
487 		yylval.i = n;
488 		RET(ARG);
489 	} else {
490 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
491 		if (c == '(') {
492 			RET(CALL);
493 		} else {
494 			RET(VAR);
495 		}
496 	}
497 }
498 
499 void startreg(void)	/* next call to yylex will return a regular expression */
500 {
501 	reg = 1;
502 }
503 
504 int regexpr(void)
505 {
506 	int c;
507 	static char *buf = NULL;
508 	static int bufsz = 500;
509 	char *bp;
510 
511 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
512 		FATAL("out of space for rex expr");
513 	bp = buf;
514 	for ( ; (c = input()) != '/' && c != 0; ) {
515 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
516 			FATAL("out of space for reg expr %.10s...", buf);
517 		if (c == '\n') {
518 			SYNTAX( "newline in regular expression %.10s...", buf );
519 			unput('\n');
520 			break;
521 		} else if (c == '\\') {
522 			*bp++ = '\\';
523 			*bp++ = input();
524 		} else {
525 			*bp++ = c;
526 		}
527 	}
528 	*bp = 0;
529 	if (c == 0)
530 		SYNTAX("non-terminated regular expression %.10s...", buf);
531 	yylval.s = tostring(buf);
532 	unput('/');
533 	RET(REGEXPR);
534 }
535 
536 /* low-level lexical stuff, sort of inherited from lex */
537 
char	ebuf[300];		/* circular record of the characters most recently returned by input() */
char	*ep = ebuf;		/* next slot of ebuf to be written */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;		/* not referenced by the routines in this part of the file */
543 
544 int input(void)	/* get next lexical input character */
545 {
546 	int c;
547 	extern char *lexprog;
548 
549 	if (yysptr > yysbuf)
550 		c = (uschar)*--yysptr;
551 	else if (lexprog != NULL) {	/* awk '...' */
552 		if ((c = (uschar)*lexprog) != 0)
553 			lexprog++;
554 	} else				/* awk -f ... */
555 		c = pgetc();
556 	if (c == '\n')
557 		lineno++;
558 	else if (c == EOF)
559 		c = 0;
560 	if (ep >= ebuf + sizeof ebuf)
561 		ep = ebuf;
562 	return *ep++ = c;
563 }
564 
565 void unput(int c)	/* put lexical character back on input */
566 {
567 	if (c == '\n')
568 		lineno--;
569 	if (yysptr >= yysbuf + sizeof(yysbuf))
570 		FATAL("pushed back too much: %.20s...", yysbuf);
571 	*yysptr++ = c;
572 	if (--ep < ebuf)
573 		ep = ebuf + sizeof(ebuf) - 1;
574 }
575 
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that subsequent input() calls
 * return them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
583