xref: /freebsd/contrib/one-true-awk/lex.c (revision d056fa046c6a91b90cd98165face0e42a33a5173)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
extern YYSTYPE	yylval;	/* value of the current token; yylex() and its helpers fill it in */
extern int	infunc;	/* presumably non-zero while a function body is being parsed; set elsewhere */

int	lineno	= 1;	/* current input line: input() adds one per '\n', unput() takes it back */
int	bracecnt = 0;	/* '{' seen minus '}' seen; negative means an extra '}' */
int	brackcnt  = 0;	/* same bookkeeping for '[' and ']' */
int	parencnt = 0;	/* same bookkeeping for '(' and ')' */
39 
/* One row of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling of the reserved word */
	int	sub;	/* value word() stores in yylval.i for this word */
	int	type;	/* token type word() returns for this word */
} Keyword;
45 
/*
 * Reserved words and built-in function names.  Columns are: spelling,
 * value placed in yylval.i, token type returned by word().  The rows must
 * stay in strcmp() order (upper case sorts before lower case) because
 * word() looks names up with binsearch().
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
};
91 
/*
 * RET(x): return token x from the enclosing lexer function.
 *
 * With DEBUG defined, the token's name is printed first whenever the dbg
 * flag is set.  The debug form is wrapped in do { } while (0) so that
 * "RET(x);" is a single statement and is safe in an unbraced if/else, and
 * x is evaluated exactly once.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { int ret_tok_ = (x); if (dbg) printf("lex %s\n", tokname(ret_tok_)); return ret_tok_; } while (0)
#else
#define	RET(x)	return(x)
#endif
98 
/* peek: report the next input character without consuming it. */
int peek(void)
{
	int ahead;

	ahead = input();
	unput(ahead);	/* hand it straight back so the next input() sees it again */
	return ahead;
}
105 
/*
 * gettok: read one raw token from the input into the buffer *pbuf.
 *
 * pbuf/psz: caller's buffer and its size; both are written back on the
 *	name and number paths because the buffer may have been grown.
 *
 * Returns:
 *	0	end of input
 *	'a'	a name (letters, digits, '_'); text is in *pbuf
 *	'0'	a number; text is in *pbuf
 *	c	any other single character, which is also left in *pbuf
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character and the terminating NUL
			 * (assumes adjbuf grows the buffer to at least the
			 * requested length).  Testing only bp-buf >= sz let
			 * the final "*bp = 0" land one past the end when the
			 * input stopped exactly at capacity.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			if (bp-buf+2 > sz)	/* same room-for-NUL rule as above */
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: rem+1 is
			 * buf+1, so cutting the buffer first would leave
			 * nothing to push back and silently drop input.
			 */
			unputstr(rem+1); /* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
168 
int	word(char *);	/* classify a name: reserved word, ARG, CALL or VAR */
int	string(void);	/* scan the rest of a "..." string constant */
int	regexpr(void);	/* scan the rest of a /.../ regular expression */
int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
174 
175 int yylex(void)
176 {
177 	int c;
178 	static char *buf = 0;
179 	static int bufsize = 500;
180 
181 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
182 		FATAL( "out of space in yylex" );
183 	if (sc) {
184 		sc = 0;
185 		RET('}');
186 	}
187 	if (reg) {
188 		reg = 0;
189 		return regexpr();
190 	}
191 /* printf("top\n"); */
192 	for (;;) {
193 		c = gettok(&buf, &bufsize);
194 /* printf("gettok [%s]\n", buf); */
195 		if (c == 0)
196 			return 0;
197 		if (isalpha(c) || c == '_')
198 			return word(buf);
199 		if (isdigit(c)) {
200 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
201 			/* should this also have STR set? */
202 			RET(NUMBER);
203 		}
204 
205 		yylval.i = c;
206 		switch (c) {
207 		case '\n':	/* {EOL} */
208 			RET(NL);
209 		case '\r':	/* assume \n is coming */
210 		case ' ':	/* {WS}+ */
211 		case '\t':
212 			break;
213 		case '#':	/* #.* strip comments */
214 			while ((c = input()) != '\n' && c != 0)
215 				;
216 			unput(c);
217 			break;
218 		case ';':
219 			RET(';');
220 		case '\\':
221 			if (peek() == '\n') {
222 				input();
223 			} else if (peek() == '\r') {
224 				input(); input();	/* \n */
225 				lineno++;
226 			} else {
227 				RET(c);
228 			}
229 			break;
230 		case '&':
231 			if (peek() == '&') {
232 				input(); RET(AND);
233 			} else
234 				RET('&');
235 		case '|':
236 			if (peek() == '|') {
237 				input(); RET(BOR);
238 			} else
239 				RET('|');
240 		case '!':
241 			if (peek() == '=') {
242 				input(); yylval.i = NE; RET(NE);
243 			} else if (peek() == '~') {
244 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
245 			} else
246 				RET(NOT);
247 		case '~':
248 			yylval.i = MATCH;
249 			RET(MATCHOP);
250 		case '<':
251 			if (peek() == '=') {
252 				input(); yylval.i = LE; RET(LE);
253 			} else {
254 				yylval.i = LT; RET(LT);
255 			}
256 		case '=':
257 			if (peek() == '=') {
258 				input(); yylval.i = EQ; RET(EQ);
259 			} else {
260 				yylval.i = ASSIGN; RET(ASGNOP);
261 			}
262 		case '>':
263 			if (peek() == '=') {
264 				input(); yylval.i = GE; RET(GE);
265 			} else if (peek() == '>') {
266 				input(); yylval.i = APPEND; RET(APPEND);
267 			} else {
268 				yylval.i = GT; RET(GT);
269 			}
270 		case '+':
271 			if (peek() == '+') {
272 				input(); yylval.i = INCR; RET(INCR);
273 			} else if (peek() == '=') {
274 				input(); yylval.i = ADDEQ; RET(ASGNOP);
275 			} else
276 				RET('+');
277 		case '-':
278 			if (peek() == '-') {
279 				input(); yylval.i = DECR; RET(DECR);
280 			} else if (peek() == '=') {
281 				input(); yylval.i = SUBEQ; RET(ASGNOP);
282 			} else
283 				RET('-');
284 		case '*':
285 			if (peek() == '=') {	/* *= */
286 				input(); yylval.i = MULTEQ; RET(ASGNOP);
287 			} else if (peek() == '*') {	/* ** or **= */
288 				input();	/* eat 2nd * */
289 				if (peek() == '=') {
290 					input(); yylval.i = POWEQ; RET(ASGNOP);
291 				} else {
292 					RET(POWER);
293 				}
294 			} else
295 				RET('*');
296 		case '/':
297 			RET('/');
298 		case '%':
299 			if (peek() == '=') {
300 				input(); yylval.i = MODEQ; RET(ASGNOP);
301 			} else
302 				RET('%');
303 		case '^':
304 			if (peek() == '=') {
305 				input(); yylval.i = POWEQ; RET(ASGNOP);
306 			} else
307 				RET(POWER);
308 
309 		case '$':
310 			/* BUG: awkward, if not wrong */
311 			c = gettok(&buf, &bufsize);
312 			if (isalpha(c)) {
313 				if (strcmp(buf, "NF") == 0) {	/* very special */
314 					unputstr("(NF)");
315 					RET(INDIRECT);
316 				}
317 				c = peek();
318 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
319 					unputstr(buf);
320 					RET(INDIRECT);
321 				}
322 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
323 				RET(IVAR);
324 			} else if (c == 0) {	/*  */
325 				SYNTAX( "unexpected end of input after $" );
326 				RET(';');
327 			} else {
328 				unputstr(buf);
329 				RET(INDIRECT);
330 			}
331 
332 		case '}':
333 			if (--bracecnt < 0)
334 				SYNTAX( "extra }" );
335 			sc = 1;
336 			RET(';');
337 		case ']':
338 			if (--brackcnt < 0)
339 				SYNTAX( "extra ]" );
340 			RET(']');
341 		case ')':
342 			if (--parencnt < 0)
343 				SYNTAX( "extra )" );
344 			RET(')');
345 		case '{':
346 			bracecnt++;
347 			RET('{');
348 		case '[':
349 			brackcnt++;
350 			RET('[');
351 		case '(':
352 			parencnt++;
353 			RET('(');
354 
355 		case '"':
356 			return string();	/* BUG: should be like tran.c ? */
357 
358 		default:
359 			RET(c);
360 		}
361 	}
362 }
363 
364 int string(void)
365 {
366 	int c, n;
367 	char *s, *bp;
368 	static char *buf = 0;
369 	static int bufsz = 500;
370 
371 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
372 		FATAL("out of space for strings");
373 	for (bp = buf; (c = input()) != '"'; ) {
374 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
375 			FATAL("out of space for string %.10s...", buf);
376 		switch (c) {
377 		case '\n':
378 		case '\r':
379 		case 0:
380 			SYNTAX( "non-terminated string %.10s...", buf );
381 			lineno++;
382 			if (c == 0)	/* hopeless */
383 				FATAL( "giving up" );
384 			break;
385 		case '\\':
386 			c = input();
387 			switch (c) {
388 			case '"': *bp++ = '"'; break;
389 			case 'n': *bp++ = '\n'; break;
390 			case 't': *bp++ = '\t'; break;
391 			case 'f': *bp++ = '\f'; break;
392 			case 'r': *bp++ = '\r'; break;
393 			case 'b': *bp++ = '\b'; break;
394 			case 'v': *bp++ = '\v'; break;
395 			case 'a': *bp++ = '\007'; break;
396 			case '\\': *bp++ = '\\'; break;
397 
398 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
399 			case '3': case '4': case '5': case '6': case '7':
400 				n = c - '0';
401 				if ((c = peek()) >= '0' && c < '8') {
402 					n = 8 * n + input() - '0';
403 					if ((c = peek()) >= '0' && c < '8')
404 						n = 8 * n + input() - '0';
405 				}
406 				*bp++ = n;
407 				break;
408 
409 			case 'x':	/* hex  \x0-9a-fA-F + */
410 			    {	char xbuf[100], *px;
411 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
412 					if (isdigit(c)
413 					 || (c >= 'a' && c <= 'f')
414 					 || (c >= 'A' && c <= 'F'))
415 						*px++ = c;
416 					else
417 						break;
418 				}
419 				*px = 0;
420 				unput(c);
421 	  			sscanf(xbuf, "%x", &n);
422 				*bp++ = n;
423 				break;
424 			    }
425 
426 			default:
427 				*bp++ = c;
428 				break;
429 			}
430 			break;
431 		default:
432 			*bp++ = c;
433 			break;
434 		}
435 	}
436 	*bp = 0;
437 	s = tostring(buf);
438 	*bp++ = ' '; *bp++ = 0;
439 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
440 	RET(STRING);
441 }
442 
443 
444 int binsearch(char *w, Keyword *kp, int n)
445 {
446 	int cond, low, mid, high;
447 
448 	low = 0;
449 	high = n - 1;
450 	while (low <= high) {
451 		mid = (low + high) / 2;
452 		if ((cond = strcmp(w, kp[mid].word)) < 0)
453 			high = mid - 1;
454 		else if (cond > 0)
455 			low = mid + 1;
456 		else
457 			return mid;
458 	}
459 	return -1;
460 }
461 
462 int word(char *w)
463 {
464 	Keyword *kp;
465 	int c, n;
466 
467 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
468 	kp = keywords + n;
469 	if (n != -1) {	/* found in table */
470 		yylval.i = kp->sub;
471 		switch (kp->type) {	/* special handling */
472 		case FSYSTEM:
473 			if (safe)
474 				SYNTAX( "system is unsafe" );
475 			RET(kp->type);
476 		case FUNC:
477 			if (infunc)
478 				SYNTAX( "illegal nested function" );
479 			RET(kp->type);
480 		case RETURN:
481 			if (!infunc)
482 				SYNTAX( "return not in function" );
483 			RET(kp->type);
484 		case VARNF:
485 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
486 			RET(VARNF);
487 		default:
488 			RET(kp->type);
489 		}
490 	}
491 	c = peek();	/* look for '(' */
492 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
493 		yylval.i = n;
494 		RET(ARG);
495 	} else {
496 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
497 		if (c == '(') {
498 			RET(CALL);
499 		} else {
500 			RET(VAR);
501 		}
502 	}
503 }
504 
505 void startreg(void)	/* next call to yylex will return a regular expression */
506 {
507 	reg = 1;
508 }
509 
510 int regexpr(void)
511 {
512 	int c;
513 	static char *buf = 0;
514 	static int bufsz = 500;
515 	char *bp;
516 
517 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
518 		FATAL("out of space for rex expr");
519 	bp = buf;
520 	for ( ; (c = input()) != '/' && c != 0; ) {
521 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
522 			FATAL("out of space for reg expr %.10s...", buf);
523 		if (c == '\n') {
524 			SYNTAX( "newline in regular expression %.10s...", buf );
525 			unput('\n');
526 			break;
527 		} else if (c == '\\') {
528 			*bp++ = '\\';
529 			*bp++ = input();
530 		} else {
531 			*bp++ = c;
532 		}
533 	}
534 	*bp = 0;
535 	if (c == 0)
536 		SYNTAX("non-terminated regular expression %.10s...", buf);
537 	yylval.s = tostring(buf);
538 	unput('/');
539 	RET(REGEXPR);
540 }
541 
542 /* low-level lexical stuff, sort of inherited from lex */
543 
char	ebuf[300];	/* circular record of recently read characters, filled by input() */
char	*ep = ebuf;	/* next slot in ebuf; input() advances it, unput() steps it back */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character; input() pops from here */
FILE	*yyin = 0;	/* not referenced by the lexer code visible here */
549 
550 int input(void)	/* get next lexical input character */
551 {
552 	int c;
553 	extern char *lexprog;
554 
555 	if (yysptr > yysbuf)
556 		c = (uschar)*--yysptr;
557 	else if (lexprog != NULL) {	/* awk '...' */
558 		if ((c = (uschar)*lexprog) != 0)
559 			lexprog++;
560 	} else				/* awk -f ... */
561 		c = pgetc();
562 	if (c == '\n')
563 		lineno++;
564 	else if (c == EOF)
565 		c = 0;
566 	if (ep >= ebuf + sizeof ebuf)
567 		ep = ebuf;
568 	return *ep++ = c;
569 }
570 
571 void unput(int c)	/* put lexical character back on input */
572 {
573 	if (c == '\n')
574 		lineno--;
575 	if (yysptr >= yysbuf + sizeof(yysbuf))
576 		FATAL("pushed back too much: %.20s...", yysbuf);
577 	*yysptr++ = c;
578 	if (--ep < ebuf)
579 		ep = ebuf + sizeof(ebuf) - 1;
580 }
581 
/*
 * unputstr: push the whole string s back onto the input.  Characters go
 * back last-first so that input() returns them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	int k = strlen(s);

	while (--k >= 0)
		unput(s[k]);
}
589