xref: /freebsd/contrib/one-true-awk/lex.c (revision faf25f48d601ae39f5752602f3020e2e92605625)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31 
32 extern YYSTYPE	yylval;
33 extern bool	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
/* One reserved word or built-in function name, as looked up by word(). */
typedef struct Keyword {
	const char *word;	/* spelling matched against the scanned name */
	int	sub;		/* value word() stores in yylval.i on a match */
	int	type;		/* token code word() returns on a match */
} Keyword;
45 
46 const Keyword keywords[] = {	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "and",	FAND,		BLTIN },
51 	{ "atan2",	FATAN,		BLTIN },
52 	{ "break",	BREAK,		BREAK },
53 	{ "close",	CLOSE,		CLOSE },
54 	{ "compl",	FCOMPL,		BLTIN },
55 	{ "continue",	CONTINUE,	CONTINUE },
56 	{ "cos",	FCOS,		BLTIN },
57 	{ "delete",	DELETE,		DELETE },
58 	{ "do",		DO,		DO },
59 	{ "else",	ELSE,		ELSE },
60 	{ "exit",	EXIT,		EXIT },
61 	{ "exp",	FEXP,		BLTIN },
62 	{ "fflush",	FFLUSH,		BLTIN },
63 	{ "for",	FOR,		FOR },
64 	{ "func",	FUNC,		FUNC },
65 	{ "function",	FUNC,		FUNC },
66 	{ "gensub",	GENSUB,		GENSUB },
67 	{ "getline",	GETLINE,	GETLINE },
68 	{ "gsub",	GSUB,		GSUB },
69 	{ "if",		IF,		IF },
70 	{ "in",		IN,		IN },
71 	{ "index",	INDEX,		INDEX },
72 	{ "int",	FINT,		BLTIN },
73 	{ "length",	FLENGTH,	BLTIN },
74 	{ "log",	FLOG,		BLTIN },
75 	{ "lshift",	FLSHIFT,	BLTIN },
76 	{ "match",	MATCHFCN,	MATCHFCN },
77 	{ "next",	NEXT,		NEXT },
78 	{ "nextfile",	NEXTFILE,	NEXTFILE },
79 	{ "or",		FFOR,		BLTIN },
80 	{ "print",	PRINT,		PRINT },
81 	{ "printf",	PRINTF,		PRINTF },
82 	{ "rand",	FRAND,		BLTIN },
83 	{ "return",	RETURN,		RETURN },
84 	{ "rshift",	FRSHIFT,	BLTIN },
85 	{ "sin",	FSIN,		BLTIN },
86 	{ "split",	SPLIT,		SPLIT },
87 	{ "sprintf",	SPRINTF,	SPRINTF },
88 	{ "sqrt",	FSQRT,		BLTIN },
89 	{ "srand",	FSRAND,		BLTIN },
90 	{ "strftime",	FSTRFTIME,	BLTIN },
91 	{ "sub",	SUB,		SUB },
92 	{ "substr",	SUBSTR,		SUBSTR },
93 	{ "system",	FSYSTEM,	BLTIN },
94 	{ "systime",	FSYSTIME,	BLTIN },
95 	{ "tolower",	FTOLOWER,	BLTIN },
96 	{ "toupper",	FTOUPPER,	BLTIN },
97 	{ "while",	WHILE,		WHILE },
98 	{ "xor",	FXOR,		BLTIN },
99 };
100 
/*
 * RET(x): return token x from the enclosing function; when dbg is set the
 * token's name is printed first.  Note that x is expanded twice.
 */
#define	RET(x)	{ \
	if (dbg) \
		printf("lex %s\n", tokname(x)); \
	return (x); \
}
102 
/*
 * peek: return the next input character without consuming it.  The
 * character (including the 0 that input() gives at end of input) is read
 * with input() and immediately pushed back with unput().
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
109 
110 static int gettok(char **pbuf, int *psz)	/* get next input token */
111 {
112 	int c, retc;
113 	char *buf = *pbuf;
114 	int sz = *psz;
115 	char *bp = buf;
116 
117 	c = input();
118 	if (c == 0)
119 		return 0;
120 	buf[0] = c;
121 	buf[1] = 0;
122 	if (!isalnum(c) && c != '.' && c != '_')
123 		return c;
124 
125 	*bp++ = c;
126 	if (isalpha(c) || c == '_') {	/* it's a varname */
127 		for ( ; (c = input()) != 0; ) {
128 			if (bp-buf >= sz)
129 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
130 					FATAL( "out of space for name %.10s...", buf );
131 			if (isalnum(c) || c == '_')
132 				*bp++ = c;
133 			else {
134 				*bp = 0;
135 				unput(c);
136 				break;
137 			}
138 		}
139 		*bp = 0;
140 		retc = 'a';	/* alphanumeric */
141 	} else {	/* maybe it's a number, but could be . */
142 		char *rem;
143 		/* read input until can't be a number */
144 		for ( ; (c = input()) != 0; ) {
145 			if (bp-buf >= sz)
146 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
147 					FATAL( "out of space for number %.10s...", buf );
148 			if (isdigit(c) || c == 'e' || c == 'E'
149 			  || c == '.' || c == '+' || c == '-')
150 				*bp++ = c;
151 			else {
152 				unput(c);
153 				break;
154 			}
155 		}
156 		*bp = 0;
157 		strtod(buf, &rem);	/* parse the number */
158 		if (rem == buf) {	/* it wasn't a valid number at all */
159 			buf[1] = 0;	/* return one character as token */
160 			retc = (uschar)buf[0];	/* character is its own type */
161 			unputstr(rem+1); /* put rest back for later */
162 		} else {	/* some prefix was a number */
163 			unputstr(rem);	/* put rest back for later */
164 			rem[0] = 0;	/* truncate buf after number part */
165 			retc = '0';	/* type is number */
166 		}
167 	}
168 	*pbuf = buf;
169 	*psz = sz;
170 	return retc;
171 }
172 
/* Scanner helpers defined later in this file. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags tested (and cleared) at the top of yylex(). */
bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
178 
179 int yylex(void)
180 {
181 	int c;
182 	static char *buf = NULL;
183 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
184 
185 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
186 		FATAL( "out of space in yylex" );
187 	if (sc) {
188 		sc = false;
189 		RET('}');
190 	}
191 	if (reg) {
192 		reg = false;
193 		return regexpr();
194 	}
195 	for (;;) {
196 		c = gettok(&buf, &bufsize);
197 		if (c == 0)
198 			return 0;
199 		if (isalpha(c) || c == '_')
200 			return word(buf);
201 		if (isdigit(c)) {
202 			char *cp = tostring(buf);
203 			double result;
204 
205 			if (is_number(cp, & result))
206 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
207 			else
208 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
209 			free(cp);
210 			/* should this also have STR set? */
211 			RET(NUMBER);
212 		}
213 
214 		yylval.i = c;
215 		switch (c) {
216 		case '\n':	/* {EOL} */
217 			lineno++;
218 			RET(NL);
219 		case '\r':	/* assume \n is coming */
220 		case ' ':	/* {WS}+ */
221 		case '\t':
222 			break;
223 		case '#':	/* #.* strip comments */
224 			while ((c = input()) != '\n' && c != 0)
225 				;
226 			unput(c);
227 			/*
228 			 * Next line is a hack, itcompensates for
229 			 * unput's treatment of \n.
230 			 */
231 			lineno++;
232 			break;
233 		case ';':
234 			RET(';');
235 		case '\\':
236 			if (peek() == '\n') {
237 				input();
238 				lineno++;
239 			} else if (peek() == '\r') {
240 				input(); input();	/* \n */
241 				lineno++;
242 			} else {
243 				RET(c);
244 			}
245 			break;
246 		case '&':
247 			if (peek() == '&') {
248 				input(); RET(AND);
249 			} else
250 				RET('&');
251 		case '|':
252 			if (peek() == '|') {
253 				input(); RET(BOR);
254 			} else
255 				RET('|');
256 		case '!':
257 			if (peek() == '=') {
258 				input(); yylval.i = NE; RET(NE);
259 			} else if (peek() == '~') {
260 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
261 			} else
262 				RET(NOT);
263 		case '~':
264 			yylval.i = MATCH;
265 			RET(MATCHOP);
266 		case '<':
267 			if (peek() == '=') {
268 				input(); yylval.i = LE; RET(LE);
269 			} else {
270 				yylval.i = LT; RET(LT);
271 			}
272 		case '=':
273 			if (peek() == '=') {
274 				input(); yylval.i = EQ; RET(EQ);
275 			} else {
276 				yylval.i = ASSIGN; RET(ASGNOP);
277 			}
278 		case '>':
279 			if (peek() == '=') {
280 				input(); yylval.i = GE; RET(GE);
281 			} else if (peek() == '>') {
282 				input(); yylval.i = APPEND; RET(APPEND);
283 			} else {
284 				yylval.i = GT; RET(GT);
285 			}
286 		case '+':
287 			if (peek() == '+') {
288 				input(); yylval.i = INCR; RET(INCR);
289 			} else if (peek() == '=') {
290 				input(); yylval.i = ADDEQ; RET(ASGNOP);
291 			} else
292 				RET('+');
293 		case '-':
294 			if (peek() == '-') {
295 				input(); yylval.i = DECR; RET(DECR);
296 			} else if (peek() == '=') {
297 				input(); yylval.i = SUBEQ; RET(ASGNOP);
298 			} else
299 				RET('-');
300 		case '*':
301 			if (peek() == '=') {	/* *= */
302 				input(); yylval.i = MULTEQ; RET(ASGNOP);
303 			} else if (peek() == '*') {	/* ** or **= */
304 				input();	/* eat 2nd * */
305 				if (peek() == '=') {
306 					input(); yylval.i = POWEQ; RET(ASGNOP);
307 				} else {
308 					RET(POWER);
309 				}
310 			} else
311 				RET('*');
312 		case '/':
313 			RET('/');
314 		case '%':
315 			if (peek() == '=') {
316 				input(); yylval.i = MODEQ; RET(ASGNOP);
317 			} else
318 				RET('%');
319 		case '^':
320 			if (peek() == '=') {
321 				input(); yylval.i = POWEQ; RET(ASGNOP);
322 			} else
323 				RET(POWER);
324 
325 		case '$':
326 			/* BUG: awkward, if not wrong */
327 			c = gettok(&buf, &bufsize);
328 			if (isalpha(c)) {
329 				if (strcmp(buf, "NF") == 0) {	/* very special */
330 					unputstr("(NF)");
331 					RET(INDIRECT);
332 				}
333 				c = peek();
334 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
335 					unputstr(buf);
336 					RET(INDIRECT);
337 				}
338 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
339 				RET(IVAR);
340 			} else if (c == 0) {	/*  */
341 				SYNTAX( "unexpected end of input after $" );
342 				RET(';');
343 			} else {
344 				unputstr(buf);
345 				RET(INDIRECT);
346 			}
347 
348 		case '}':
349 			if (--bracecnt < 0)
350 				SYNTAX( "extra }" );
351 			sc = true;
352 			RET(';');
353 		case ']':
354 			if (--brackcnt < 0)
355 				SYNTAX( "extra ]" );
356 			RET(']');
357 		case ')':
358 			if (--parencnt < 0)
359 				SYNTAX( "extra )" );
360 			RET(')');
361 		case '{':
362 			bracecnt++;
363 			RET('{');
364 		case '[':
365 			brackcnt++;
366 			RET('[');
367 		case '(':
368 			parencnt++;
369 			RET('(');
370 
371 		case '"':
372 			return string();	/* BUG: should be like tran.c ? */
373 
374 		default:
375 			RET(c);
376 		}
377 	}
378 }
379 
380 int string(void)
381 {
382 	int c, n;
383 	char *s, *bp;
384 	static char *buf = NULL;
385 	static int bufsz = 500;
386 
387 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
388 		FATAL("out of space for strings");
389 	for (bp = buf; (c = input()) != '"'; ) {
390 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
391 			FATAL("out of space for string %.10s...", buf);
392 		switch (c) {
393 		case '\n':
394 		case '\r':
395 		case 0:
396 			*bp = '\0';
397 			SYNTAX( "non-terminated string %.10s...", buf );
398 			if (c == 0)	/* hopeless */
399 				FATAL( "giving up" );
400 			lineno++;
401 			break;
402 		case '\\':
403 			c = input();
404 			switch (c) {
405 			case '\n': break;
406 			case '"': *bp++ = '"'; break;
407 			case 'n': *bp++ = '\n'; break;
408 			case 't': *bp++ = '\t'; break;
409 			case 'f': *bp++ = '\f'; break;
410 			case 'r': *bp++ = '\r'; break;
411 			case 'b': *bp++ = '\b'; break;
412 			case 'v': *bp++ = '\v'; break;
413 			case 'a': *bp++ = '\a'; break;
414 			case '\\': *bp++ = '\\'; break;
415 
416 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
417 			case '3': case '4': case '5': case '6': case '7':
418 				n = c - '0';
419 				if ((c = peek()) >= '0' && c < '8') {
420 					n = 8 * n + input() - '0';
421 					if ((c = peek()) >= '0' && c < '8')
422 						n = 8 * n + input() - '0';
423 				}
424 				*bp++ = n;
425 				break;
426 
427 			case 'x':	/* hex  \x0-9a-fA-F + */
428 			    {	char xbuf[100], *px;
429 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
430 					if (isdigit(c)
431 					 || (c >= 'a' && c <= 'f')
432 					 || (c >= 'A' && c <= 'F'))
433 						*px++ = c;
434 					else
435 						break;
436 				}
437 				*px = 0;
438 				unput(c);
439 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
440 				*bp++ = n;
441 				break;
442 			    }
443 
444 			default:
445 				*bp++ = c;
446 				break;
447 			}
448 			break;
449 		default:
450 			*bp++ = c;
451 			break;
452 		}
453 	}
454 	*bp = 0;
455 	s = tostring(buf);
456 	*bp++ = ' '; *bp++ = '\0';
457 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
458 	free(s);
459 	RET(STRING);
460 }
461 
462 
463 static int binsearch(char *w, const Keyword *kp, int n)
464 {
465 	int cond, low, mid, high;
466 
467 	low = 0;
468 	high = n - 1;
469 	while (low <= high) {
470 		mid = (low + high) / 2;
471 		if ((cond = strcmp(w, kp[mid].word)) < 0)
472 			high = mid - 1;
473 		else if (cond > 0)
474 			low = mid + 1;
475 		else
476 			return mid;
477 	}
478 	return -1;
479 }
480 
481 int word(char *w)
482 {
483 	const Keyword *kp;
484 	int c, n;
485 
486 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
487 	if (n != -1) {	/* found in table */
488 		kp = keywords + n;
489 		yylval.i = kp->sub;
490 		switch (kp->type) {	/* special handling */
491 		case BLTIN:
492 			if (kp->sub == FSYSTEM && safe)
493 				SYNTAX( "system is unsafe" );
494 			RET(kp->type);
495 		case FUNC:
496 			if (infunc)
497 				SYNTAX( "illegal nested function" );
498 			RET(kp->type);
499 		case RETURN:
500 			if (!infunc)
501 				SYNTAX( "return not in function" );
502 			RET(kp->type);
503 		case VARNF:
504 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
505 			RET(VARNF);
506 		default:
507 			RET(kp->type);
508 		}
509 	}
510 	c = peek();	/* look for '(' */
511 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
512 		yylval.i = n;
513 		RET(ARG);
514 	} else {
515 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
516 		if (c == '(') {
517 			RET(CALL);
518 		} else {
519 			RET(VAR);
520 		}
521 	}
522 }
523 
524 void startreg(void)	/* next call to yylex will return a regular expression */
525 {
526 	reg = true;
527 }
528 
529 int regexpr(void)
530 {
531 	int c;
532 	static char *buf = NULL;
533 	static int bufsz = 500;
534 	char *bp;
535 
536 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
537 		FATAL("out of space for rex expr");
538 	bp = buf;
539 	for ( ; (c = input()) != '/' && c != 0; ) {
540 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
541 			FATAL("out of space for reg expr %.10s...", buf);
542 		if (c == '\n') {
543 			*bp = '\0';
544 			SYNTAX( "newline in regular expression %.10s...", buf );
545 			unput('\n');
546 			break;
547 		} else if (c == '\\') {
548 			*bp++ = '\\';
549 			*bp++ = input();
550 		} else {
551 			*bp++ = c;
552 		}
553 	}
554 	*bp = 0;
555 	if (c == 0)
556 		SYNTAX("non-terminated regular expression %.10s...", buf);
557 	yylval.s = tostring(buf);
558 	unput('/');
559 	RET(REGEXPR);
560 }
561 
562 /* low-level lexical stuff, sort of inherited from lex */
563 
/* Record of recent input: input() stores every character it returns here. */
char	ebuf[300];
char	*ep = ebuf;		/* next slot in ebuf; wraps back to the start */

/* Pushback stack: unput() pushes, input() pops before reading anything new. */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* first free slot in yysbuf */

FILE	*yyin = NULL;		/* NOTE(review): not used in the code visible here */
569 
570 int input(void)	/* get next lexical input character */
571 {
572 	int c;
573 	extern char *lexprog;
574 
575 	if (yysptr > yysbuf)
576 		c = (uschar)*--yysptr;
577 	else if (lexprog != NULL) {	/* awk '...' */
578 		if ((c = (uschar)*lexprog) != 0)
579 			lexprog++;
580 	} else				/* awk -f ... */
581 		c = pgetc();
582 	if (c == EOF)
583 		c = 0;
584 	if (ep >= ebuf + sizeof ebuf)
585 		ep = ebuf;
586 	*ep = c;
587 	if (c != 0) {
588 		ep++;
589 	}
590 	return (c);
591 }
592 
593 void unput(int c)	/* put lexical character back on input */
594 {
595 	if (c == '\n')
596 		lineno--;
597 	if (yysptr >= yysbuf + sizeof(yysbuf))
598 		FATAL("pushed back too much: %.20s...", yysbuf);
599 	*yysptr++ = c;
600 	if (--ep < ebuf)
601 		ep = ebuf + sizeof(ebuf) - 1;
602 }
603 
/*
 * unputstr: push the whole string s back on the input.  Characters are
 * handed to unput() last-to-first so that input() returns them in their
 * original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
611