xref: /freebsd/contrib/one-true-awk/lex.c (revision df347c8a2e8ac08df4c1a6058c12b9f01c289cff)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
32 extern YYSTYPE	yylval;
33 extern int	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
/* One entry of the reserved-word table that word() looks up via binsearch(). */
typedef struct Keyword {
	const char *word;	/* spelling of the keyword; compared with strcmp() */
	int	sub;		/* value word() stores in yylval.i on a match */
	int	type;		/* token type word() returns to the parser */
} Keyword;
45 
/*
 * Reserved words and built-in function names recognized by word().
 * Each row is { spelling, sub, type }.  binsearch() compares entries
 * with strcmp(), so rows must stay in strcmp() (byte value) order:
 * upper-case names sort before lower-case ones.
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
97 
/*
 * RET(x): return token x from the enclosing function.  When dbg is
 * non-zero the result of tokname(x) is printed first.  The argument is
 * expanded twice, so it must not have side effects.  The expansion is
 * a bare { } block; call sites write RET(x); with a trailing semicolon.
 */
#define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }
99 
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately pushed back
 * with unput(); the value read (0 at end of input) is returned.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
106 
/*
 * gettok: read the next raw token from the lexical input.
 *
 * pbuf/psz describe a caller-owned growable buffer that receives the
 * token text (NUL-terminated).  The buffer is assumed to hold at least
 * two bytes on entry.  If the buffer has to grow, *pbuf and *psz are
 * updated before returning.
 *
 * Returns:
 *   0    at end of input;
 *   'a'  for a name (letters, digits, '_', not starting with a digit);
 *   '0'  for a number (the longest prefix strtod() accepts);
 *   otherwise the single character read, which is also left in buf.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;	/* buf was not reallocated: no need to update *pbuf */

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for c AND the terminating NUL.  Growing
			 * only once bp-buf >= sz is one character too late:
			 * if input ends right after buf fills up, the final
			 * *bp = 0 below would land at buf[sz].
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same invariant as above: room for c plus NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating buf: rem+1
			 * points at buf[1], so truncating first would push
			 * back an empty string and drop those characters.
			 */
			unputstr(rem+1); /* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
169 
/* Forward declarations for helpers that yylex() calls before their definitions. */
int	word(char *);
int	string(void);
int	regexpr(void);
/* One-shot flags tested and cleared at the top of yylex(). */
int	sc	= 0;	/* 1 => return a } right now (set by yylex when it reads '}') */
int	reg	= 0;	/* 1 => return a REGEXPR now (set by startreg) */
175 
/*
 * yylex: return the next token to the parser, setting yylval as needed.
 *
 * Pending one-shot requests are served first: a '}' deferred from the
 * previous call (sc), then a regular expression requested through
 * startreg() (reg).  Otherwise tokens are pulled from gettok() until one
 * produces a result; white space, comments and backslash-newline are
 * skipped by looping.  Returns 0 at end of input.
 */
int yylex(void)
{
	int c;
	static char *buf = NULL;	/* token text buffer, grown by gettok() */
	static int bufsize = 5; /* BUG: setting this small causes core dump! */

	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
		FATAL( "out of space in yylex" );
	if (sc) {
		sc = 0;
		RET('}');
	}
	if (reg) {
		reg = 0;
		return regexpr();
	}
	for (;;) {
		c = gettok(&buf, &bufsize);
		if (c == 0)
			return 0;
		/* gettok() reports names as 'a' and numbers as '0' */
		if (isalpha(c) || c == '_')
			return word(buf);
		if (isdigit(c)) {
			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
			/* should this also have STR set? */
			RET(NUMBER);
		}

		yylval.i = c;
		switch (c) {
		case '\n':	/* {EOL} */
			lineno++;
			RET(NL);
		case '\r':	/* assume \n is coming */
		case ' ':	/* {WS}+ */
		case '\t':
			break;
		case '#':	/* #.* strip comments */
			while ((c = input()) != '\n' && c != 0)
				;
			unput(c);	/* leave the newline (or end marker) for the next pass */
			break;
		case ';':
			RET(';');
		case '\\':
			/* backslash-newline is a line continuation: swallow it */
			if (peek() == '\n') {
				input();
				lineno++;
			} else if (peek() == '\r') {
				input(); input();	/* \n */
				lineno++;
			} else {
				RET(c);
			}
			break;
		case '&':
			if (peek() == '&') {
				input(); RET(AND);
			} else
				RET('&');
		case '|':
			if (peek() == '|') {
				input(); RET(BOR);
			} else
				RET('|');
		case '!':
			if (peek() == '=') {
				input(); yylval.i = NE; RET(NE);
			} else if (peek() == '~') {
				input(); yylval.i = NOTMATCH; RET(MATCHOP);
			} else
				RET(NOT);
		case '~':
			yylval.i = MATCH;
			RET(MATCHOP);
		case '<':
			if (peek() == '=') {
				input(); yylval.i = LE; RET(LE);
			} else {
				yylval.i = LT; RET(LT);
			}
		case '=':
			if (peek() == '=') {
				input(); yylval.i = EQ; RET(EQ);
			} else {
				yylval.i = ASSIGN; RET(ASGNOP);
			}
		case '>':
			if (peek() == '=') {
				input(); yylval.i = GE; RET(GE);
			} else if (peek() == '>') {
				input(); yylval.i = APPEND; RET(APPEND);
			} else {
				yylval.i = GT; RET(GT);
			}
		case '+':
			if (peek() == '+') {
				input(); yylval.i = INCR; RET(INCR);
			} else if (peek() == '=') {
				input(); yylval.i = ADDEQ; RET(ASGNOP);
			} else
				RET('+');
		case '-':
			if (peek() == '-') {
				input(); yylval.i = DECR; RET(DECR);
			} else if (peek() == '=') {
				input(); yylval.i = SUBEQ; RET(ASGNOP);
			} else
				RET('-');
		case '*':
			if (peek() == '=') {	/* *= */
				input(); yylval.i = MULTEQ; RET(ASGNOP);
			} else if (peek() == '*') {	/* ** or **= */
				input();	/* eat 2nd * */
				if (peek() == '=') {
					input(); yylval.i = POWEQ; RET(ASGNOP);
				} else {
					RET(POWER);
				}
			} else
				RET('*');
		case '/':
			RET('/');
		case '%':
			if (peek() == '=') {
				input(); yylval.i = MODEQ; RET(ASGNOP);
			} else
				RET('%');
		case '^':
			if (peek() == '=') {
				input(); yylval.i = POWEQ; RET(ASGNOP);
			} else
				RET(POWER);

		case '$':
			/* BUG: awkward, if not wrong */
			/* look at the token after '$' to decide between IVAR and INDIRECT */
			c = gettok(&buf, &bufsize);
			if (isalpha(c)) {
				if (strcmp(buf, "NF") == 0) {	/* very special */
					unputstr("(NF)");
					RET(INDIRECT);
				}
				c = peek();
				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
					/* call, subscript or function argument: re-lex the name */
					unputstr(buf);
					RET(INDIRECT);
				}
				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
				RET(IVAR);
			} else if (c == 0) {	/*  */
				SYNTAX( "unexpected end of input after $" );
				RET(';');
			} else {
				unputstr(buf);
				RET(INDIRECT);
			}

		case '}':
			if (--bracecnt < 0)
				SYNTAX( "extra }" );
			sc = 1;		/* the '}' itself is returned by the next call */
			RET(';');
		case ']':
			if (--brackcnt < 0)
				SYNTAX( "extra ]" );
			RET(']');
		case ')':
			if (--parencnt < 0)
				SYNTAX( "extra )" );
			RET(')');
		case '{':
			bracecnt++;
			RET('{');
		case '[':
			brackcnt++;
			RET('[');
		case '(':
			parencnt++;
			RET('(');

		case '"':
			return string();	/* BUG: should be like tran.c ? */

		default:
			RET(c);
		}
	}
}
364 
365 int string(void)
366 {
367 	int c, n;
368 	char *s, *bp;
369 	static char *buf = NULL;
370 	static int bufsz = 500;
371 
372 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
373 		FATAL("out of space for strings");
374 	for (bp = buf; (c = input()) != '"'; ) {
375 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
376 			FATAL("out of space for string %.10s...", buf);
377 		switch (c) {
378 		case '\n':
379 		case '\r':
380 		case 0:
381 			*bp = '\0';
382 			SYNTAX( "non-terminated string %.10s...", buf );
383 			if (c == 0)	/* hopeless */
384 				FATAL( "giving up" );
385 			lineno++;
386 			break;
387 		case '\\':
388 			c = input();
389 			switch (c) {
390 			case '"': *bp++ = '"'; break;
391 			case 'n': *bp++ = '\n'; break;
392 			case 't': *bp++ = '\t'; break;
393 			case 'f': *bp++ = '\f'; break;
394 			case 'r': *bp++ = '\r'; break;
395 			case 'b': *bp++ = '\b'; break;
396 			case 'v': *bp++ = '\v'; break;
397 			case 'a': *bp++ = '\007'; break;
398 			case '\\': *bp++ = '\\'; break;
399 
400 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
401 			case '3': case '4': case '5': case '6': case '7':
402 				n = c - '0';
403 				if ((c = peek()) >= '0' && c < '8') {
404 					n = 8 * n + input() - '0';
405 					if ((c = peek()) >= '0' && c < '8')
406 						n = 8 * n + input() - '0';
407 				}
408 				*bp++ = n;
409 				break;
410 
411 			case 'x':	/* hex  \x0-9a-fA-F + */
412 			    {	char xbuf[100], *px;
413 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
414 					if (isdigit(c)
415 					 || (c >= 'a' && c <= 'f')
416 					 || (c >= 'A' && c <= 'F'))
417 						*px++ = c;
418 					else
419 						break;
420 				}
421 				*px = 0;
422 				unput(c);
423 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
424 				*bp++ = n;
425 				break;
426 			    }
427 
428 			default:
429 				*bp++ = c;
430 				break;
431 			}
432 			break;
433 		default:
434 			*bp++ = c;
435 			break;
436 		}
437 	}
438 	*bp = 0;
439 	s = tostring(buf);
440 	*bp++ = ' '; *bp++ = 0;
441 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
442 	RET(STRING);
443 }
444 
445 
446 int binsearch(char *w, Keyword *kp, int n)
447 {
448 	int cond, low, mid, high;
449 
450 	low = 0;
451 	high = n - 1;
452 	while (low <= high) {
453 		mid = (low + high) / 2;
454 		if ((cond = strcmp(w, kp[mid].word)) < 0)
455 			high = mid - 1;
456 		else if (cond > 0)
457 			low = mid + 1;
458 		else
459 			return mid;
460 	}
461 	return -1;
462 }
463 
464 int word(char *w)
465 {
466 	Keyword *kp;
467 	int c, n;
468 
469 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
470 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
471 	kp = keywords + n;
472 	if (n != -1) {	/* found in table */
473 		yylval.i = kp->sub;
474 		switch (kp->type) {	/* special handling */
475 		case BLTIN:
476 			if (kp->sub == FSYSTEM && safe)
477 				SYNTAX( "system is unsafe" );
478 			RET(kp->type);
479 		case FUNC:
480 			if (infunc)
481 				SYNTAX( "illegal nested function" );
482 			RET(kp->type);
483 		case RETURN:
484 			if (!infunc)
485 				SYNTAX( "return not in function" );
486 			RET(kp->type);
487 		case VARNF:
488 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
489 			RET(VARNF);
490 		default:
491 			RET(kp->type);
492 		}
493 	}
494 	c = peek();	/* look for '(' */
495 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
496 		yylval.i = n;
497 		RET(ARG);
498 	} else {
499 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
500 		if (c == '(') {
501 			RET(CALL);
502 		} else {
503 			RET(VAR);
504 		}
505 	}
506 }
507 
508 void startreg(void)	/* next call to yylex will return a regular expression */
509 {
510 	reg = 1;
511 }
512 
513 int regexpr(void)
514 {
515 	int c;
516 	static char *buf = NULL;
517 	static int bufsz = 500;
518 	char *bp;
519 
520 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
521 		FATAL("out of space for rex expr");
522 	bp = buf;
523 	for ( ; (c = input()) != '/' && c != 0; ) {
524 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
525 			FATAL("out of space for reg expr %.10s...", buf);
526 		if (c == '\n') {
527 			*bp = '\0';
528 			SYNTAX( "newline in regular expression %.10s...", buf );
529 			unput('\n');
530 			break;
531 		} else if (c == '\\') {
532 			*bp++ = '\\';
533 			*bp++ = input();
534 		} else {
535 			*bp++ = c;
536 		}
537 	}
538 	*bp = 0;
539 	if (c == 0)
540 		SYNTAX("non-terminated regular expression %.10s...", buf);
541 	yylval.s = tostring(buf);
542 	unput('/');
543 	RET(REGEXPR);
544 }
545 
546 /* low-level lexical stuff, sort of inherited from lex */
547 
char	ebuf[300];	/* circular record of characters read by input() */
char	*ep = ebuf;	/* current position in ebuf: advanced by input(), backed up by unput() */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;	/* NOTE(review): not referenced by the routines in this file -- confirm use elsewhere */
553 
554 int input(void)	/* get next lexical input character */
555 {
556 	int c;
557 	extern char *lexprog;
558 
559 	if (yysptr > yysbuf)
560 		c = (uschar)*--yysptr;
561 	else if (lexprog != NULL) {	/* awk '...' */
562 		if ((c = (uschar)*lexprog) != 0)
563 			lexprog++;
564 	} else				/* awk -f ... */
565 		c = pgetc();
566 	if (c == EOF)
567 		c = 0;
568 	if (ep >= ebuf + sizeof ebuf)
569 		ep = ebuf;
570 	*ep = c;
571 	if (c != 0) {
572 		ep++;
573 	}
574 	return (c);
575 }
576 
577 void unput(int c)	/* put lexical character back on input */
578 {
579 	if (yysptr >= yysbuf + sizeof(yysbuf))
580 		FATAL("pushed back too much: %.20s...", yysbuf);
581 	*yysptr++ = c;
582 	if (--ep < ebuf)
583 		ep = ebuf + sizeof(ebuf) - 1;
584 }
585 
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that subsequent input()
 * calls return them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
593