xref: /freebsd/contrib/one-true-awk/lex.c (revision 17d6c636720d00f77e5d098daf4c278f89d84f7b)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
32 extern YYSTYPE	yylval;
33 extern int	infunc;
34 
35 int	lineno	= 1;
36 int	bracecnt = 0;
37 int	brackcnt  = 0;
38 int	parencnt = 0;
39 
40 typedef struct Keyword {
41 	char	*word;
42 	int	sub;
43 	int	type;
44 } Keyword;
45 
46 Keyword keywords[] ={	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "atan2",	FATAN,		BLTIN },
51 	{ "break",	BREAK,		BREAK },
52 	{ "close",	CLOSE,		CLOSE },
53 	{ "continue",	CONTINUE,	CONTINUE },
54 	{ "cos",	FCOS,		BLTIN },
55 	{ "delete",	DELETE,		DELETE },
56 	{ "do",		DO,		DO },
57 	{ "else",	ELSE,		ELSE },
58 	{ "exit",	EXIT,		EXIT },
59 	{ "exp",	FEXP,		BLTIN },
60 	{ "fflush",	FFLUSH,		BLTIN },
61 	{ "for",	FOR,		FOR },
62 	{ "func",	FUNC,		FUNC },
63 	{ "function",	FUNC,		FUNC },
64 	{ "getline",	GETLINE,	GETLINE },
65 	{ "gsub",	GSUB,		GSUB },
66 	{ "if",		IF,		IF },
67 	{ "in",		IN,		IN },
68 	{ "index",	INDEX,		INDEX },
69 	{ "int",	FINT,		BLTIN },
70 	{ "length",	FLENGTH,	BLTIN },
71 	{ "log",	FLOG,		BLTIN },
72 	{ "match",	MATCHFCN,	MATCHFCN },
73 	{ "next",	NEXT,		NEXT },
74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
75 	{ "print",	PRINT,		PRINT },
76 	{ "printf",	PRINTF,		PRINTF },
77 	{ "rand",	FRAND,		BLTIN },
78 	{ "return",	RETURN,		RETURN },
79 	{ "sin",	FSIN,		BLTIN },
80 	{ "split",	SPLIT,		SPLIT },
81 	{ "sprintf",	SPRINTF,	SPRINTF },
82 	{ "sqrt",	FSQRT,		BLTIN },
83 	{ "srand",	FSRAND,		BLTIN },
84 	{ "sub",	SUB,		SUB },
85 	{ "substr",	SUBSTR,		SUBSTR },
86 	{ "system",	FSYSTEM,	BLTIN },
87 	{ "tolower",	FTOLOWER,	BLTIN },
88 	{ "toupper",	FTOUPPER,	BLTIN },
89 	{ "while",	WHILE,		WHILE },
90 };
91 
#define DEBUG
/*
 * RET(x): return token x from the enclosing int-valued lexer function.
 * In the DEBUG build it first prints the token name when `dbg` is set.
 *
 * The debug form is wrapped in do { } while (0) so that "RET(x);" is a
 * single statement everywhere (including an unbraced if/else arm), and
 * x is evaluated exactly once.
 */
#ifdef	DEBUG
#define	RET(x)	do { \
		int ret_tok_ = (x); \
		if (dbg) \
			printf("lex %s\n", tokname(ret_tok_)); \
		return ret_tok_; \
	} while (0)
#else
#define	RET(x)	return(x)
#endif
98 
/*
 * peek: report the next input character without consuming it.
 * Implemented as an input() immediately undone by unput().
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
105 
106 int gettok(char **pbuf, int *psz)	/* get next input token */
107 {
108 	int c;
109 	char *buf = *pbuf;
110 	int sz = *psz;
111 	char *bp = buf;
112 
113 	c = input();
114 	if (c == 0)
115 		return 0;
116 	buf[0] = c;
117 	buf[1] = 0;
118 	if (!isalnum(c) && c != '.' && c != '_')
119 		return c;
120 
121 	*bp++ = c;
122 	if (isalpha(c) || c == '_') {	/* it's a varname */
123 		for ( ; (c = input()) != 0; ) {
124 			if (bp-buf >= sz)
125 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
126 					FATAL( "out of space for name %.10s...", buf );
127 			if (isalnum(c) || c == '_')
128 				*bp++ = c;
129 			else {
130 				*bp = 0;
131 				unput(c);
132 				break;
133 			}
134 		}
135 		*bp = 0;
136 	} else {	/* it's a number */
137 		char *rem;
138 		/* read input until can't be a number */
139 		for ( ; (c = input()) != 0; ) {
140 			if (bp-buf >= sz)
141 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
142 					FATAL( "out of space for number %.10s...", buf );
143 			if (isdigit(c) || c == 'e' || c == 'E'
144 			  || c == '.' || c == '+' || c == '-')
145 				*bp++ = c;
146 			else {
147 				unput(c);
148 				break;
149 			}
150 		}
151 		*bp = 0;
152 		strtod(buf, &rem);	/* parse the number */
153 		unputstr(rem);		/* put rest back for later */
154 		rem[0] = 0;
155 	}
156 	*pbuf = buf;
157 	*psz = sz;
158 	return buf[0];
159 }
160 
161 int	word(char *);
162 int	string(void);
163 int	regexpr(void);
164 int	sc	= 0;	/* 1 => return a } right now */
165 int	reg	= 0;	/* 1 => return a REGEXPR now */
166 
167 int yylex(void)
168 {
169 	int c;
170 	static char *buf = 0;
171 	static int bufsize = 500;
172 
173 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
174 		FATAL( "out of space in yylex" );
175 	if (sc) {
176 		sc = 0;
177 		RET('}');
178 	}
179 	if (reg) {
180 		reg = 0;
181 		return regexpr();
182 	}
183 	for (;;) {
184 		c = gettok(&buf, &bufsize);
185 		if (c == 0)
186 			return 0;
187 		if (isalpha(c) || c == '_')
188 			return word(buf);
189 		if (isdigit(c) || c == '.') {
190 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
191 			/* should this also have STR set? */
192 			RET(NUMBER);
193 		}
194 
195 		yylval.i = c;
196 		switch (c) {
197 		case '\n':	/* {EOL} */
198 			RET(NL);
199 		case '\r':	/* assume \n is coming */
200 		case ' ':	/* {WS}+ */
201 		case '\t':
202 			break;
203 		case '#':	/* #.* strip comments */
204 			while ((c = input()) != '\n' && c != 0)
205 				;
206 			unput(c);
207 			break;
208 		case ';':
209 			RET(';');
210 		case '\\':
211 			if (peek() == '\n') {
212 				input();
213 			} else if (peek() == '\r') {
214 				input(); input();	/* \n */
215 				lineno++;
216 			} else {
217 				RET(c);
218 			}
219 			break;
220 		case '&':
221 			if (peek() == '&') {
222 				input(); RET(AND);
223 			} else
224 				RET('&');
225 		case '|':
226 			if (peek() == '|') {
227 				input(); RET(BOR);
228 			} else
229 				RET('|');
230 		case '!':
231 			if (peek() == '=') {
232 				input(); yylval.i = NE; RET(NE);
233 			} else if (peek() == '~') {
234 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
235 			} else
236 				RET(NOT);
237 		case '~':
238 			yylval.i = MATCH;
239 			RET(MATCHOP);
240 		case '<':
241 			if (peek() == '=') {
242 				input(); yylval.i = LE; RET(LE);
243 			} else {
244 				yylval.i = LT; RET(LT);
245 			}
246 		case '=':
247 			if (peek() == '=') {
248 				input(); yylval.i = EQ; RET(EQ);
249 			} else {
250 				yylval.i = ASSIGN; RET(ASGNOP);
251 			}
252 		case '>':
253 			if (peek() == '=') {
254 				input(); yylval.i = GE; RET(GE);
255 			} else if (peek() == '>') {
256 				input(); yylval.i = APPEND; RET(APPEND);
257 			} else {
258 				yylval.i = GT; RET(GT);
259 			}
260 		case '+':
261 			if (peek() == '+') {
262 				input(); yylval.i = INCR; RET(INCR);
263 			} else if (peek() == '=') {
264 				input(); yylval.i = ADDEQ; RET(ASGNOP);
265 			} else
266 				RET('+');
267 		case '-':
268 			if (peek() == '-') {
269 				input(); yylval.i = DECR; RET(DECR);
270 			} else if (peek() == '=') {
271 				input(); yylval.i = SUBEQ; RET(ASGNOP);
272 			} else
273 				RET('-');
274 		case '*':
275 			if (peek() == '=') {	/* *= */
276 				input(); yylval.i = MULTEQ; RET(ASGNOP);
277 			} else if (peek() == '*') {	/* ** or **= */
278 				input();	/* eat 2nd * */
279 				if (peek() == '=') {
280 					input(); yylval.i = POWEQ; RET(ASGNOP);
281 				} else {
282 					RET(POWER);
283 				}
284 			} else
285 				RET('*');
286 		case '/':
287 			RET('/');
288 		case '%':
289 			if (peek() == '=') {
290 				input(); yylval.i = MODEQ; RET(ASGNOP);
291 			} else
292 				RET('%');
293 		case '^':
294 			if (peek() == '=') {
295 				input(); yylval.i = POWEQ; RET(ASGNOP);
296 			} else
297 				RET(POWER);
298 
299 		case '$':
300 			/* BUG: awkward, if not wrong */
301 			c = gettok(&buf, &bufsize);
302 			if (isalpha(c)) {
303 				if (strcmp(buf, "NF") == 0) {	/* very special */
304 					unputstr("(NF)");
305 					RET(INDIRECT);
306 				}
307 				c = peek();
308 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
309 					unputstr(buf);
310 					RET(INDIRECT);
311 				}
312 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
313 				RET(IVAR);
314 			} else {
315 				unputstr(buf);
316 				RET(INDIRECT);
317 			}
318 
319 		case '}':
320 			if (--bracecnt < 0)
321 				SYNTAX( "extra }" );
322 			sc = 1;
323 			RET(';');
324 		case ']':
325 			if (--brackcnt < 0)
326 				SYNTAX( "extra ]" );
327 			RET(']');
328 		case ')':
329 			if (--parencnt < 0)
330 				SYNTAX( "extra )" );
331 			RET(')');
332 		case '{':
333 			bracecnt++;
334 			RET('{');
335 		case '[':
336 			brackcnt++;
337 			RET('[');
338 		case '(':
339 			parencnt++;
340 			RET('(');
341 
342 		case '"':
343 			return string();	/* BUG: should be like tran.c ? */
344 
345 		default:
346 			RET(c);
347 		}
348 	}
349 }
350 
351 int string(void)
352 {
353 	int c, n;
354 	char *s, *bp;
355 	static char *buf = 0;
356 	static int bufsz = 500;
357 
358 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
359 		FATAL("out of space for strings");
360 	for (bp = buf; (c = input()) != '"'; ) {
361 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
362 			FATAL("out of space for string %.10s...", buf);
363 		switch (c) {
364 		case '\n':
365 		case '\r':
366 		case 0:
367 			SYNTAX( "non-terminated string %.10s...", buf );
368 			lineno++;
369 			break;
370 		case '\\':
371 			c = input();
372 			switch (c) {
373 			case '"': *bp++ = '"'; break;
374 			case 'n': *bp++ = '\n'; break;
375 			case 't': *bp++ = '\t'; break;
376 			case 'f': *bp++ = '\f'; break;
377 			case 'r': *bp++ = '\r'; break;
378 			case 'b': *bp++ = '\b'; break;
379 			case 'v': *bp++ = '\v'; break;
380 			case 'a': *bp++ = '\007'; break;
381 			case '\\': *bp++ = '\\'; break;
382 
383 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
384 			case '3': case '4': case '5': case '6': case '7':
385 				n = c - '0';
386 				if ((c = peek()) >= '0' && c < '8') {
387 					n = 8 * n + input() - '0';
388 					if ((c = peek()) >= '0' && c < '8')
389 						n = 8 * n + input() - '0';
390 				}
391 				*bp++ = n;
392 				break;
393 
394 			case 'x':	/* hex  \x0-9a-fA-F + */
395 			    {	char xbuf[100], *px;
396 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
397 					if (isdigit(c)
398 					 || (c >= 'a' && c <= 'f')
399 					 || (c >= 'A' && c <= 'F'))
400 						*px++ = c;
401 					else
402 						break;
403 				}
404 				*px = 0;
405 				unput(c);
406 	  			sscanf(xbuf, "%x", &n);
407 				*bp++ = n;
408 				break;
409 			    }
410 
411 			default:
412 				*bp++ = c;
413 				break;
414 			}
415 			break;
416 		default:
417 			*bp++ = c;
418 			break;
419 		}
420 	}
421 	*bp = 0;
422 	s = tostring(buf);
423 	*bp++ = ' '; *bp++ = 0;
424 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
425 	RET(STRING);
426 }
427 
428 
429 int binsearch(char *w, Keyword *kp, int n)
430 {
431 	int cond, low, mid, high;
432 
433 	low = 0;
434 	high = n - 1;
435 	while (low <= high) {
436 		mid = (low + high) / 2;
437 		if ((cond = strcmp(w, kp[mid].word)) < 0)
438 			high = mid - 1;
439 		else if (cond > 0)
440 			low = mid + 1;
441 		else
442 			return mid;
443 	}
444 	return -1;
445 }
446 
447 int word(char *w)
448 {
449 	Keyword *kp;
450 	int c, n;
451 
452 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
453 	kp = keywords + n;
454 	if (n != -1) {	/* found in table */
455 		yylval.i = kp->sub;
456 		switch (kp->type) {	/* special handling */
457 		case FSYSTEM:
458 			if (safe)
459 				SYNTAX( "system is unsafe" );
460 			RET(kp->type);
461 		case FUNC:
462 			if (infunc)
463 				SYNTAX( "illegal nested function" );
464 			RET(kp->type);
465 		case RETURN:
466 			if (!infunc)
467 				SYNTAX( "return not in function" );
468 			RET(kp->type);
469 		case VARNF:
470 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
471 			RET(VARNF);
472 		default:
473 			RET(kp->type);
474 		}
475 	}
476 	c = peek();	/* look for '(' */
477 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
478 		yylval.i = n;
479 		RET(ARG);
480 	} else {
481 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
482 		if (c == '(') {
483 			RET(CALL);
484 		} else {
485 			RET(VAR);
486 		}
487 	}
488 }
489 
490 void startreg(void)	/* next call to yyles will return a regular expression */
491 {
492 	reg = 1;
493 }
494 
495 int regexpr(void)
496 {
497 	int c;
498 	static char *buf = 0;
499 	static int bufsz = 500;
500 	char *bp;
501 
502 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
503 		FATAL("out of space for rex expr");
504 	bp = buf;
505 	for ( ; (c = input()) != '/' && c != 0; ) {
506 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
507 			FATAL("out of space for reg expr %.10s...", buf);
508 		if (c == '\n') {
509 			SYNTAX( "newline in regular expression %.10s...", buf );
510 			unput('\n');
511 			break;
512 		} else if (c == '\\') {
513 			*bp++ = '\\';
514 			*bp++ = input();
515 		} else {
516 			*bp++ = c;
517 		}
518 	}
519 	*bp = 0;
520 	yylval.s = tostring(buf);
521 	unput('/');
522 	RET(REGEXPR);
523 }
524 
525 /* low-level lexical stuff, sort of inherited from lex */
526 
/* Ring of the most recently read characters: input() stores, unput() steps back. */
char	ebuf[300];
char	*ep = ebuf;		/* next slot of ebuf to be written */

/* Pushback stack: unput() pushes, input() pops before reading anything new. */
char	yysbuf[100];
char	*yysptr = yysbuf;	/* one past the top of the stack */

FILE	*yyin = 0;
532 
533 int input(void)	/* get next lexical input character */
534 {
535 	int c;
536 	extern char *lexprog;
537 
538 	if (yysptr > yysbuf)
539 		c = *--yysptr;
540 	else if (lexprog != NULL) {	/* awk '...' */
541 		if ((c = *lexprog) != 0)
542 			lexprog++;
543 	} else				/* awk -f ... */
544 		c = pgetc();
545 	if (c == '\n')
546 		lineno++;
547 	else if (c == EOF)
548 		c = 0;
549 	if (ep >= ebuf + sizeof ebuf)
550 		ep = ebuf;
551 	return *ep++ = c;
552 }
553 
554 void unput(int c)	/* put lexical character back on input */
555 {
556 	if (c == '\n')
557 		lineno--;
558 	if (yysptr >= yysbuf + sizeof(yysbuf))
559 		FATAL("pushed back too much: %.20s...", yysbuf);
560 	*yysptr++ = c;
561 	if (--ep < ebuf)
562 		ep = ebuf + sizeof(ebuf) - 1;
563 }
564 
/*
 * unputstr: push the whole string s back on the input.
 * Characters are pushed last-to-first so that input() returns them in
 * their original order.
 */
void unputstr(char *s)
{
	char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
572