1 /*
2 * Copyright (C) Lucent Technologies 1997
3 * All Rights Reserved
4 *
5 * Permission to use, copy, modify, and distribute this software and
6 * its documentation for any purpose and without fee is hereby
7 * granted, provided that the above copyright notice appear in all
8 * copies and that both that the copyright notice and this
9 * permission notice and warranty disclaimer appear in supporting
10 * documentation, and that the name Lucent Technologies or any of
11 * its entities not be used in advertising or publicity pertaining
12 * to distribution of the software without specific, written prior
13 * permission.
14 *
15 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 * THIS SOFTWARE.
23 */
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "y.tab.h"
31
32 extern YYSTYPE yylval;
33 extern int infunc;
34
35 off_t lineno = 1;
36 int bracecnt = 0;
37 int brackcnt = 0;
38 int parencnt = 0;
39
/*
 * One reserved word or built-in name recognised by word().
 * Field order matters: the keywords[] table uses positional initializers.
 */
typedef struct Keyword {
	const char	*word;	/* spelling, compared with strcmp() */
	int		sub;	/* value stored in yylval.i on a match */
	int		type;	/* token code returned to the parser */
} Keyword;
45
46 Keyword keywords[] = { /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "atan2", FATAN, BLTIN },
51 { "break", BREAK, BREAK },
52 { "close", CLOSE, CLOSE },
53 { "continue", CONTINUE, CONTINUE },
54 { "cos", FCOS, BLTIN },
55 { "delete", DELETE, DELETE },
56 { "do", DO, DO },
57 { "else", ELSE, ELSE },
58 { "exit", EXIT, EXIT },
59 { "exp", FEXP, BLTIN },
60 { "fflush", FFLUSH, BLTIN },
61 { "for", FOR, FOR },
62 { "func", FUNC, FUNC },
63 { "function", FUNC, FUNC },
64 { "getline", GETLINE, GETLINE },
65 { "gsub", GSUB, GSUB },
66 { "if", IF, IF },
67 { "in", IN, IN },
68 { "index", INDEX, INDEX },
69 { "int", FINT, BLTIN },
70 { "length", FLENGTH, BLTIN },
71 { "log", FLOG, BLTIN },
72 { "match", MATCHFCN, MATCHFCN },
73 { "next", NEXT, NEXT },
74 { "nextfile", NEXTFILE, NEXTFILE },
75 { "print", PRINT, PRINT },
76 { "printf", PRINTF, PRINTF },
77 { "rand", FRAND, BLTIN },
78 { "return", RETURN, RETURN },
79 { "sin", FSIN, BLTIN },
80 { "split", SPLIT, SPLIT },
81 { "sprintf", SPRINTF, SPRINTF },
82 { "sqrt", FSQRT, BLTIN },
83 { "srand", FSRAND, BLTIN },
84 { "sub", SUB, SUB },
85 { "substr", SUBSTR, SUBSTR },
86 { "system", FSYSTEM, BLTIN },
87 { "tolower", FTOLOWER, BLTIN },
88 { "toupper", FTOUPPER, BLTIN },
89 { "while", WHILE, WHILE },
90 };
91
/*
 * RET(x): return token code x from the enclosing function, printing
 * "lex <name>" first when dbg is nonzero.
 *
 * The body is wrapped in do { } while (0) so that "RET(x);" is exactly
 * one statement and can sit safely under an unbraced if/else.
 * Note: x is evaluated twice when dbg is set; pass side-effect-free
 * expressions only.
 */
#define	RET(x)	do { if (dbg) (void) printf("lex %s\n", tokname(x)); return (x); } while (0)
93
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately pushed back with
 * unput(); at end of input this returns 0 (and a 0 is pushed back).
 */
int
peek(void)
{
	int next;

	next = input();
	unput(next);
	return (next);
}
101
/*
 * gettok: read the next raw token from the input into the buffer *pbuf.
 *
 * pbuf/psz describe a growable buffer; it is enlarged through adjbuf()
 * as needed and the (possibly moved) buffer and its size are stored back
 * through pbuf/psz before a name or number is returned.  The buffer must
 * hold at least 2 bytes on entry, because buf[0] and buf[1] are written
 * before any size check.
 *
 * Returns:
 *   0    at end of input;
 *   'a'  for a name ([A-Za-z_][A-Za-z0-9_]*), text in the buffer;
 *   '0'  for a number, text in the buffer;
 *   c    the character itself for anything else, including a leading
 *        '.' that does not start a valid number.
 */
int
gettok(char **pbuf, size_t *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	size_t sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return (0);
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return (c);

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for (; (c = input()) != 0; ) {
			/*
			 * Grow while there is not room for this character
			 * AND the terminating NUL; otherwise the "*bp = 0"
			 * after the loop could land one past the end when
			 * the input ends exactly as the buffer fills up.
			 */
			if ((size_t)(bp - buf) + 1 >= sz &&
			    !adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
				FATAL("out of space for name %.10s...", buf);
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for (; (c = input()) != 0; ) {
			/* same headroom rule as for names, see above */
			if ((size_t)(bp - buf) + 1 >= sz &&
			    !adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
				FATAL("out of space for number %.10s...", buf);
			if (isdigit(c) || c == 'e' || c == 'E' ||
			    c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		(void) strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the over-read tail back BEFORE truncating:
			 * once buf[1] is 0, rem+1 is an empty string and
			 * the tail would silently be lost.
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;		/* return one character as token */
			retc = buf[0];		/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return (retc);
}
165
/* scanners defined further down that yylex() dispatches to */
int	word(char *);
int	string(void);
int	regexpr(void);

/* one-shot flags consulted at the top of yylex() */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
171
172 int
yylex(void)173 yylex(void)
174 {
175 int c;
176 static char *buf = NULL;
177 /* BUG: setting this small causes core dump! */
178 static size_t bufsize = 5;
179
180 if (buf == NULL && (buf = (char *)malloc(bufsize)) == NULL)
181 FATAL("out of space in yylex");
182 if (sc) {
183 sc = 0;
184 RET('}');
185 }
186 if (reg) {
187 reg = 0;
188 return (regexpr());
189 }
190 for (;;) {
191 c = gettok(&buf, &bufsize);
192 if (c == 0)
193 return (0);
194 if (isalpha(c) || c == '_')
195 return (word(buf));
196 if (isdigit(c)) {
197 yylval.cp = setsymtab(
198 buf, tostring(buf), atof(buf), CON|NUM, symtab);
199 /* should this also have STR set? */
200 RET(NUMBER);
201 }
202
203 yylval.i = c;
204 switch (c) {
205 case '\n': /* {EOL} */
206 lineno++;
207 RET(NL);
208 case '\r': /* assume \n is coming */
209 case ' ': /* {WS}+ */
210 case '\t':
211 break;
212 case '#': /* #.* strip comments */
213 while ((c = input()) != '\n' && c != 0)
214 ;
215 unput(c);
216 break;
217 case ';':
218 RET(';');
219 case '\\':
220 if (peek() == '\n') {
221 (void) input();
222 lineno++;
223 } else if (peek() == '\r') {
224 (void) input();
225 (void) input(); /* BUG: check for \n */
226 lineno++;
227 } else {
228 RET(c);
229 }
230 break;
231 case '&':
232 if (peek() == '&') {
233 (void) input();
234 RET(AND);
235 } else
236 RET('&');
237 case '|':
238 if (peek() == '|') {
239 (void) input();
240 RET(BOR);
241 } else
242 RET('|');
243 case '!':
244 if (peek() == '=') {
245 (void) input();
246 yylval.i = NE;
247 RET(NE);
248 } else if (peek() == '~') {
249 (void) input();
250 yylval.i = NOTMATCH;
251 RET(MATCHOP);
252 } else
253 RET(NOT);
254 case '~':
255 yylval.i = MATCH;
256 RET(MATCHOP);
257 case '<':
258 if (peek() == '=') {
259 (void) input();
260 yylval.i = LE;
261 RET(LE);
262 } else {
263 yylval.i = LT;
264 RET(LT);
265 }
266 case '=':
267 if (peek() == '=') {
268 (void) input();
269 yylval.i = EQ;
270 RET(EQ);
271 } else {
272 yylval.i = ASSIGN;
273 RET(ASGNOP);
274 }
275 case '>':
276 if (peek() == '=') {
277 (void) input();
278 yylval.i = GE;
279 RET(GE);
280 } else if (peek() == '>') {
281 (void) input();
282 yylval.i = APPEND;
283 RET(APPEND);
284 } else {
285 yylval.i = GT;
286 RET(GT);
287 }
288 case '+':
289 if (peek() == '+') {
290 (void) input();
291 yylval.i = INCR;
292 RET(INCR);
293 } else if (peek() == '=') {
294 (void) input();
295 yylval.i = ADDEQ;
296 RET(ASGNOP);
297 } else
298 RET('+');
299 case '-':
300 if (peek() == '-') {
301 (void) input();
302 yylval.i = DECR;
303 RET(DECR);
304 } else if (peek() == '=') {
305 (void) input();
306 yylval.i = SUBEQ;
307 RET(ASGNOP);
308 } else
309 RET('-');
310 case '*':
311 if (peek() == '=') { /* *= */
312 (void) input();
313 yylval.i = MULTEQ;
314 RET(ASGNOP);
315 } else if (peek() == '*') { /* ** or **= */
316 (void) input(); /* eat 2nd * */
317 if (peek() == '=') {
318 (void) input();
319 yylval.i = POWEQ;
320 RET(ASGNOP);
321 } else {
322 RET(POWER);
323 }
324 } else
325 RET('*');
326 case '/':
327 RET('/');
328 case '%':
329 if (peek() == '=') {
330 (void) input();
331 yylval.i = MODEQ;
332 RET(ASGNOP);
333 } else
334 RET('%');
335 case '^':
336 if (peek() == '=') {
337 (void) input();
338 yylval.i = POWEQ;
339 RET(ASGNOP);
340 } else
341 RET(POWER);
342
343 case '$':
344 /* BUG: awkward, if not wrong */
345 c = gettok(&buf, &bufsize);
346 if (isalpha(c)) {
347 if (strcmp(buf, "NF") == 0) {
348 /* very special */
349 unputstr("(NF)");
350 RET(INDIRECT);
351 }
352 c = peek();
353 if (c == '(' || c == '[' ||
354 (infunc && isarg(buf) >= 0)) {
355 unputstr(buf);
356 RET(INDIRECT);
357 }
358 yylval.cp = setsymtab(
359 buf, "", 0.0, STR|NUM, symtab);
360 RET(IVAR);
361 } else if (c == 0) { /* */
362 SYNTAX("unexpected end of input after $");
363 RET(';');
364 } else {
365 unputstr(buf);
366 RET(INDIRECT);
367 }
368
369 case '}':
370 if (--bracecnt < 0)
371 SYNTAX("extra }");
372 sc = 1;
373 RET(';');
374 case ']':
375 if (--brackcnt < 0)
376 SYNTAX("extra ]");
377 RET(']');
378 case ')':
379 if (--parencnt < 0)
380 SYNTAX("extra )");
381 RET(')');
382 case '{':
383 bracecnt++;
384 RET('{');
385 case '[':
386 brackcnt++;
387 RET('[');
388 case '(':
389 parencnt++;
390 RET('(');
391
392 case '"':
393 /* BUG: should be like tran.c ? */
394 return (string());
395
396 default:
397 RET(c);
398 }
399 }
400 }
401
402 int
string(void)403 string(void)
404 {
405 int c, n;
406 char *s, *bp;
407 static char *buf = NULL;
408 static size_t bufsz = 500;
409
410 if (buf == NULL && (buf = (char *)malloc(bufsz)) == NULL)
411 FATAL("out of space for strings");
412 for (bp = buf; (c = input()) != '"'; ) {
413 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
414 FATAL("out of space for string %.10s...", buf);
415 switch (c) {
416 case '\n':
417 case '\r':
418 case 0:
419 *bp = '\0';
420 SYNTAX("non-terminated string %.10s...", buf);
421 if (c == 0) /* hopeless */
422 FATAL("giving up");
423 lineno++;
424 break;
425 case '\\':
426 c = input();
427 switch (c) {
428 case '"': *bp++ = '"'; break;
429 case 'n': *bp++ = '\n'; break;
430 case 't': *bp++ = '\t'; break;
431 case 'f': *bp++ = '\f'; break;
432 case 'r': *bp++ = '\r'; break;
433 case 'b': *bp++ = '\b'; break;
434 case 'v': *bp++ = '\v'; break;
435 case 'a': *bp++ = '\007'; break;
436 case '\\': *bp++ = '\\'; break;
437
438 case '0': case '1': case '2': /* octal: \d \dd \ddd */
439 case '3': case '4': case '5': case '6': case '7':
440 n = c - '0';
441 if ((c = peek()) >= '0' && c < '8') {
442 n = 8 * n + input() - '0';
443 if ((c = peek()) >= '0' && c < '8')
444 n = 8 * n + input() - '0';
445 }
446 *bp++ = n;
447 break;
448
449 case 'x': { /* hex \x0-9a-fA-F + */
450 char xbuf[100], *px;
451 px = xbuf;
452 while ((c = input()) != 0 && px-xbuf < 100-2) {
453 if (isdigit(c) ||
454 (c >= 'a' && c <= 'f') ||
455 (c >= 'A' && c <= 'F'))
456 *px++ = c;
457 else
458 break;
459 }
460 *px = 0;
461 unput(c);
462 (void) sscanf(xbuf, "%x", (unsigned int *)&n);
463 *bp++ = n;
464 break;
465 }
466
467 default:
468 *bp++ = c;
469 break;
470 }
471 break;
472 default:
473 *bp++ = c;
474 break;
475 }
476 }
477 *bp = 0;
478 s = tostring(buf);
479 *bp++ = ' '; *bp++ = 0;
480 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
481 RET(STRING);
482 }
483
484
485 int
binsearch(char * w,Keyword * kp,int n)486 binsearch(char *w, Keyword *kp, int n)
487 {
488 int cond, low, mid, high;
489
490 low = 0;
491 high = n - 1;
492 while (low <= high) {
493 mid = (low + high) / 2;
494 if ((cond = strcmp(w, kp[mid].word)) < 0)
495 high = mid - 1;
496 else if (cond > 0)
497 low = mid + 1;
498 else
499 return (mid);
500 }
501 return (-1);
502 }
503
504 int
word(char * w)505 word(char *w)
506 {
507 Keyword *kp;
508 int c, n;
509
510 n = binsearch(w, keywords, sizeof (keywords) / sizeof (keywords[0]));
511 if (n != -1) { /* found in table */
512 kp = keywords + n;
513 yylval.i = kp->sub;
514 switch (kp->type) { /* special handling */
515 case BLTIN:
516 if (kp->sub == FSYSTEM && safe)
517 SYNTAX("system is unsafe");
518 RET(kp->type);
519 case FUNC:
520 if (infunc)
521 SYNTAX("illegal nested function");
522 RET(kp->type);
523 case RETURN:
524 if (!infunc)
525 SYNTAX("return not in function");
526 RET(kp->type);
527 case VARNF:
528 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
529 RET(VARNF);
530 default:
531 RET(kp->type);
532 }
533 }
534 c = peek(); /* look for '(' */
535 if (c != '(' && infunc && (n = isarg(w)) >= 0) {
536 yylval.i = n;
537 RET(ARG);
538 } else {
539 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
540 if (c == '(') {
541 RET(CALL);
542 } else {
543 RET(VAR);
544 }
545 }
546 }
547
548 void
startreg(void)549 startreg(void) /* next call to yylex will return a regular expression */
550 {
551 reg = 1;
552 }
553
554 int
regexpr(void)555 regexpr(void)
556 {
557 int c;
558 static char *buf = NULL;
559 static size_t bufsz = 500;
560 char *bp;
561
562 if (buf == NULL && (buf = (char *)malloc(bufsz)) == NULL)
563 FATAL("out of space for rex expr");
564 bp = buf;
565 for (; (c = input()) != '/' && c != 0; ) {
566 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
567 FATAL("out of space for reg expr %.10s...", buf);
568 if (c == '\n') {
569 *bp = '\0';
570 SYNTAX("newline in regular expression %.10s...", buf);
571 unput('\n');
572 break;
573 } else if (c == '\\') {
574 *bp++ = '\\';
575 *bp++ = input();
576 } else {
577 *bp++ = c;
578 }
579 }
580 *bp = 0;
581 if (c == 0)
582 SYNTAX("non-terminated regular expression %.10s...", buf);
583 yylval.s = tostring(buf);
584 unput('/');
585 RET(REGEXPR);
586 }
587
588 /* low-level lexical stuff, sort of inherited from lex */
589
char	ebuf[300];		/* circular record of characters read by input() */
char	*ep = ebuf;		/* next slot in ebuf; unput() steps it back */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;		/* not referenced by the lexer routines in this file */
595
596 int
input(void)597 input(void) /* get next lexical input character */
598 {
599 int c;
600 extern char *lexprog;
601
602 if (yysptr > yysbuf)
603 c = (uschar)*--yysptr;
604 else if (lexprog != NULL) { /* awk '...' */
605 if ((c = (uschar)*lexprog) != 0)
606 lexprog++;
607 } else /* awk -f ... */
608 c = pgetc();
609 if (c == EOF)
610 c = 0;
611 if (ep >= ebuf + sizeof (ebuf))
612 ep = ebuf;
613 *ep = c;
614 if (c != 0) {
615 ep++;
616 }
617 return (c);
618 }
619
620 void
unput(int c)621 unput(int c) /* put lexical character back on input */
622 {
623 if (yysptr >= yysbuf + sizeof (yysbuf))
624 FATAL("pushed back too much: %.20s...", yysbuf);
625 *yysptr++ = c;
626 if (--ep < ebuf)
627 ep = ebuf + sizeof (ebuf) - 1;
628 }
629
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that subsequent input() calls
 * return them in their original order.  An empty string pushes nothing.
 */
void
unputstr(const char *s)
{
	const char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
638