xref: /freebsd/contrib/one-true-awk/lex.c (revision a3d9bf49b57923118c339642594246ef73872ee8)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
/* Parser interface: yylval carries the semantic value of the token yylex returns. */
extern YYSTYPE	yylval;
/* Defined elsewhere; word() and yylex() test it to decide function-only handling
 * (argument lookup, "return" legality, nested "function" detection). */
extern int	infunc;

int	lineno	= 1;	/* current source line; bumped each time the lexer consumes a newline */
int	bracecnt = 0;	/* '{' seen minus '}' seen; going negative reports "extra }" */
int	brackcnt  = 0;	/* '[' seen minus ']' seen; going negative reports "extra ]" */
int	parencnt = 0;	/* '(' seen minus ')' seen; going negative reports "extra )" */
39 
/* One entry of the reserved-word table searched by binsearch(). */
typedef struct Keyword {
	const char *word;	/* keyword spelling, compared with strcmp() */
	int	sub;		/* value word() stores in yylval.i for this keyword */
	int	type;		/* token code word() returns for this keyword */
} Keyword;
45 
/*
 * Reserved words and built-in function names recognised by word().
 * Entries must stay in strcmp() order because lookup is a binary search.
 * Built-ins share the token type BLTIN and are told apart by the sub field.
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
97 
/*
 * Return token x from the enclosing function; when dbg is nonzero, first
 * print the token's name (via tokname) as a trace line.  The argument is
 * evaluated twice, so it must be free of side effects.  Expands to a brace
 * block, not a single statement.
 */
#define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }
99 
/* peek: report the next input character while leaving it unread. */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* push it straight back so the next input() sees it again */
	return next;
}
106 
/*
 * gettok: read the next raw token into the caller's growable buffer.
 *
 * pbuf/psz: address of the buffer pointer and of its size.  The buffer may
 *           be reallocated through adjbuf(); on the name/number paths the
 *           (possibly new) pointer and size are stored back.  The buffer
 *           must hold at least 2 bytes on entry, because buf[0] and buf[1]
 *           are written before any size check.
 *
 * Returns:
 *   0    at end of input;
 *   'a'  for a name ([A-Za-z_][A-Za-z0-9_]*), text in *pbuf;
 *   '0'  for a number (longest prefix accepted by strtod), text in *pbuf;
 *   otherwise the single character read, also left in *pbuf as a
 *   one-character string.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Reserve room for this character AND the terminating NUL.
			 * Checking only bp-buf >= sz let a name that exactly filled
			 * the buffer and ended at end of input write its NUL one
			 * byte past the allocation.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: character plus NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			retc = buf[0];	/* character is its own type */
			/*
			 * Push the over-read characters back BEFORE truncating:
			 * rem+1 is buf+1, so truncating first would make the
			 * "rest" an empty string and lose that input.
			 */
			unputstr(rem+1); /* put rest back for later */
			buf[1] = 0;	/* return one character as token */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
169 
/* Forward declarations of the helpers yylex() dispatches to. */
int	word(char *);
int	string(void);
int	regexpr(void);
/* One-shot flags consulted (and cleared) at the top of yylex(). */
int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
175 
/*
 * yylex: return the next token to the parser, setting yylval as needed.
 *
 * Returns 0 at end of input.  Two pending-state flags are honoured first:
 * sc (a '}' was seen on the previous call, which returned ';') and reg
 * (startreg() asked for a regular expression).  Otherwise raw tokens come
 * from gettok(): names go to word(), numbers become NUMBER constants in
 * symtab, blanks and comments are skipped, and operator characters are
 * combined with one character of lookahead via peek().
 */
int yylex(void)
{
	int c;
	/* token text buffer, kept across calls and grown by gettok() as needed */
	static char *buf = NULL;
	static int bufsize = 5; /* BUG: setting this small causes core dump! */

	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
		FATAL( "out of space in yylex" );
	if (sc) {
		/* previous call returned the ';' synthesized for a '}'; now deliver the '}' */
		sc = 0;
		RET('}');
	}
	if (reg) {
		/* parser requested a regular expression via startreg() */
		reg = 0;
		return regexpr();
	}
	for (;;) {
		c = gettok(&buf, &bufsize);
		if (c == 0)
			return 0;
		/* gettok reports names as 'a' and numbers as '0' */
		if (isalpha(c) || c == '_')
			return word(buf);
		if (isdigit(c)) {
			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
			/* should this also have STR set? */
			RET(NUMBER);
		}

		yylval.i = c;
		switch (c) {
		case '\n':	/* {EOL} */
			lineno++;
			RET(NL);
		case '\r':	/* assume \n is coming */
		case ' ':	/* {WS}+ */
		case '\t':
			break;
		case '#':	/* #.* strip comments */
			while ((c = input()) != '\n' && c != 0)
				;
			/* leave the newline (or end marker) for the next pass of the loop */
			unput(c);
			break;
		case ';':
			RET(';');
		case '\\':
			/* backslash-newline (or backslash-CR-LF) is a line continuation */
			if (peek() == '\n') {
				input();
				lineno++;
			} else if (peek() == '\r') {
				input(); input();	/* \n */
				lineno++;
			} else {
				RET(c);
			}
			break;
		case '&':
			if (peek() == '&') {
				input(); RET(AND);
			} else
				RET('&');
		case '|':
			if (peek() == '|') {
				input(); RET(BOR);
			} else
				RET('|');
		case '!':
			if (peek() == '=') {
				input(); yylval.i = NE; RET(NE);
			} else if (peek() == '~') {
				input(); yylval.i = NOTMATCH; RET(MATCHOP);
			} else
				RET(NOT);
		case '~':
			yylval.i = MATCH;
			RET(MATCHOP);
		case '<':
			if (peek() == '=') {
				input(); yylval.i = LE; RET(LE);
			} else {
				yylval.i = LT; RET(LT);
			}
		case '=':
			if (peek() == '=') {
				input(); yylval.i = EQ; RET(EQ);
			} else {
				yylval.i = ASSIGN; RET(ASGNOP);
			}
		case '>':
			if (peek() == '=') {
				input(); yylval.i = GE; RET(GE);
			} else if (peek() == '>') {
				input(); yylval.i = APPEND; RET(APPEND);
			} else {
				yylval.i = GT; RET(GT);
			}
		case '+':
			if (peek() == '+') {
				input(); yylval.i = INCR; RET(INCR);
			} else if (peek() == '=') {
				input(); yylval.i = ADDEQ; RET(ASGNOP);
			} else
				RET('+');
		case '-':
			if (peek() == '-') {
				input(); yylval.i = DECR; RET(DECR);
			} else if (peek() == '=') {
				input(); yylval.i = SUBEQ; RET(ASGNOP);
			} else
				RET('-');
		case '*':
			if (peek() == '=') {	/* *= */
				input(); yylval.i = MULTEQ; RET(ASGNOP);
			} else if (peek() == '*') {	/* ** or **= */
				input();	/* eat 2nd * */
				if (peek() == '=') {
					input(); yylval.i = POWEQ; RET(ASGNOP);
				} else {
					RET(POWER);
				}
			} else
				RET('*');
		case '/':
			RET('/');
		case '%':
			if (peek() == '=') {
				input(); yylval.i = MODEQ; RET(ASGNOP);
			} else
				RET('%');
		case '^':
			if (peek() == '=') {
				input(); yylval.i = POWEQ; RET(ASGNOP);
			} else
				RET(POWER);

		case '$':
			/* BUG: awkward, if not wrong */
			/* read the token after '$' to decide between IVAR and INDIRECT */
			c = gettok(&buf, &bufsize);
			if (isalpha(c)) {
				if (strcmp(buf, "NF") == 0) {	/* very special */
					/* $NF is re-lexed as $(NF) */
					unputstr("(NF)");
					RET(INDIRECT);
				}
				c = peek();
				/* a call, a subscript, or a function argument: re-lex the name after INDIRECT */
				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
					unputstr(buf);
					RET(INDIRECT);
				}
				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
				RET(IVAR);
			} else if (c == 0) {	/*  */
				SYNTAX( "unexpected end of input after $" );
				RET(';');
			} else {
				/* anything else (number, '(' ...): push it back and let the parser see it */
				unputstr(buf);
				RET(INDIRECT);
			}

		case '}':
			if (--bracecnt < 0)
				SYNTAX( "extra }" );
			/* deliver ';' now and the '}' itself on the next call */
			sc = 1;
			RET(';');
		case ']':
			if (--brackcnt < 0)
				SYNTAX( "extra ]" );
			RET(']');
		case ')':
			if (--parencnt < 0)
				SYNTAX( "extra )" );
			RET(')');
		case '{':
			bracecnt++;
			RET('{');
		case '[':
			brackcnt++;
			RET('[');
		case '(':
			parencnt++;
			RET('(');

		case '"':
			return string();	/* BUG: should be like tran.c ? */

		default:
			RET(c);
		}
	}
}
364 
365 int string(void)
366 {
367 	int c, n;
368 	char *s, *bp;
369 	static char *buf = NULL;
370 	static int bufsz = 500;
371 
372 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
373 		FATAL("out of space for strings");
374 	for (bp = buf; (c = input()) != '"'; ) {
375 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
376 			FATAL("out of space for string %.10s...", buf);
377 		switch (c) {
378 		case '\n':
379 		case '\r':
380 		case 0:
381 			*bp = '\0';
382 			SYNTAX( "non-terminated string %.10s...", buf );
383 			if (c == 0)	/* hopeless */
384 				FATAL( "giving up" );
385 			lineno++;
386 			break;
387 		case '\\':
388 			c = input();
389 			switch (c) {
390 			case '"': *bp++ = '"'; break;
391 			case 'n': *bp++ = '\n'; break;
392 			case 't': *bp++ = '\t'; break;
393 			case 'f': *bp++ = '\f'; break;
394 			case 'r': *bp++ = '\r'; break;
395 			case 'b': *bp++ = '\b'; break;
396 			case 'v': *bp++ = '\v'; break;
397 			case 'a': *bp++ = '\007'; break;
398 			case '\\': *bp++ = '\\'; break;
399 
400 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
401 			case '3': case '4': case '5': case '6': case '7':
402 				n = c - '0';
403 				if ((c = peek()) >= '0' && c < '8') {
404 					n = 8 * n + input() - '0';
405 					if ((c = peek()) >= '0' && c < '8')
406 						n = 8 * n + input() - '0';
407 				}
408 				*bp++ = n;
409 				break;
410 
411 			case 'x':	/* hex  \x0-9a-fA-F + */
412 			    {	char xbuf[100], *px;
413 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
414 					if (isdigit(c)
415 					 || (c >= 'a' && c <= 'f')
416 					 || (c >= 'A' && c <= 'F'))
417 						*px++ = c;
418 					else
419 						break;
420 				}
421 				*px = 0;
422 				unput(c);
423 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
424 				*bp++ = n;
425 				break;
426 			    }
427 
428 			default:
429 				*bp++ = c;
430 				break;
431 			}
432 			break;
433 		default:
434 			*bp++ = c;
435 			break;
436 		}
437 	}
438 	*bp = 0;
439 	s = tostring(buf);
440 	*bp++ = ' '; *bp++ = 0;
441 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
442 	RET(STRING);
443 }
444 
445 
446 int binsearch(char *w, Keyword *kp, int n)
447 {
448 	int cond, low, mid, high;
449 
450 	low = 0;
451 	high = n - 1;
452 	while (low <= high) {
453 		mid = (low + high) / 2;
454 		if ((cond = strcmp(w, kp[mid].word)) < 0)
455 			high = mid - 1;
456 		else if (cond > 0)
457 			low = mid + 1;
458 		else
459 			return mid;
460 	}
461 	return -1;
462 }
463 
464 int word(char *w)
465 {
466 	Keyword *kp;
467 	int c, n;
468 
469 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
470 	if (n != -1) {	/* found in table */
471 		kp = keywords + n;
472 		yylval.i = kp->sub;
473 		switch (kp->type) {	/* special handling */
474 		case BLTIN:
475 			if (kp->sub == FSYSTEM && safe)
476 				SYNTAX( "system is unsafe" );
477 			RET(kp->type);
478 		case FUNC:
479 			if (infunc)
480 				SYNTAX( "illegal nested function" );
481 			RET(kp->type);
482 		case RETURN:
483 			if (!infunc)
484 				SYNTAX( "return not in function" );
485 			RET(kp->type);
486 		case VARNF:
487 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
488 			RET(VARNF);
489 		default:
490 			RET(kp->type);
491 		}
492 	}
493 	c = peek();	/* look for '(' */
494 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
495 		yylval.i = n;
496 		RET(ARG);
497 	} else {
498 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
499 		if (c == '(') {
500 			RET(CALL);
501 		} else {
502 			RET(VAR);
503 		}
504 	}
505 }
506 
507 void startreg(void)	/* next call to yylex will return a regular expression */
508 {
509 	reg = 1;
510 }
511 
512 int regexpr(void)
513 {
514 	int c;
515 	static char *buf = NULL;
516 	static int bufsz = 500;
517 	char *bp;
518 
519 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
520 		FATAL("out of space for rex expr");
521 	bp = buf;
522 	for ( ; (c = input()) != '/' && c != 0; ) {
523 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
524 			FATAL("out of space for reg expr %.10s...", buf);
525 		if (c == '\n') {
526 			*bp = '\0';
527 			SYNTAX( "newline in regular expression %.10s...", buf );
528 			unput('\n');
529 			break;
530 		} else if (c == '\\') {
531 			*bp++ = '\\';
532 			*bp++ = input();
533 		} else {
534 			*bp++ = c;
535 		}
536 	}
537 	*bp = 0;
538 	if (c == 0)
539 		SYNTAX("non-terminated regular expression %.10s...", buf);
540 	yylval.s = tostring(buf);
541 	unput('/');
542 	RET(REGEXPR);
543 }
544 
545 /* low-level lexical stuff, sort of inherited from lex */
546 
/* Record of recently read characters: input() stores each one at ep and
 * wraps to the start when the end is reached; unput() steps ep back. */
char	ebuf[300];
char	*ep = ebuf;	/* next position in ebuf to be written */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;
552 
553 int input(void)	/* get next lexical input character */
554 {
555 	int c;
556 	extern char *lexprog;
557 
558 	if (yysptr > yysbuf)
559 		c = (uschar)*--yysptr;
560 	else if (lexprog != NULL) {	/* awk '...' */
561 		if ((c = (uschar)*lexprog) != 0)
562 			lexprog++;
563 	} else				/* awk -f ... */
564 		c = pgetc();
565 	if (c == EOF)
566 		c = 0;
567 	if (ep >= ebuf + sizeof ebuf)
568 		ep = ebuf;
569 	*ep = c;
570 	if (c != 0) {
571 		ep++;
572 	}
573 	return (c);
574 }
575 
576 void unput(int c)	/* put lexical character back on input */
577 {
578 	if (yysptr >= yysbuf + sizeof(yysbuf))
579 		FATAL("pushed back too much: %.20s...", yysbuf);
580 	*yysptr++ = c;
581 	if (--ep < ebuf)
582 		ep = ebuf + sizeof(ebuf) - 1;
583 }
584 
/*
 * unputstr: push the whole string s back onto the input.  Characters are
 * pushed last-to-first so that subsequent input() calls yield s in order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
592